/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.range : walkLength;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons : tuple;
import std.uni: asLowerCase, toLower, byGrapheme;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

/* Embedded D runtime option (compiler front-end 2.085 and later): 'gcopt=cleanup:none'
 * asks the garbage collector not to run a cleanup pass at program termination.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Params:
 *   cmdArgs = Command line arguments, program name first.
 *
 * Returns: The exit code supplied by processArgs when it asks to stop (help requested
 *   or argument error), 1 if tsvFilter throws, 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging.
     */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;

    /* r[0] is false when execution should stop; r[1] is then the exit code. */
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    /* NOTE(review): presumably resets the LDC profile counters so that only the
     * filtering work below is profiled - confirm against the ldc.profile docs.
     */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilter(cmdopt, cmdArgs[1..$]);
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }
    return 0;
}

immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne  FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Test a field's character or byte length
  Syntax:  --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
           --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
  Example: --char-len-lt 2:10    // True if field 2 is less than 10 characters long.
           --byte-len-gt 2:10    // True if field 2 is greater than 10 bytes long.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field1 - field3) <= 0.25

EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Field lists can be used to specify multiple fields at once. For example:

  tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv

tests that fields 1-10 are not blank and fields 1,2,5 are not "--".

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Test a field's character or byte length.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing builds the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 *   - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 *   - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *     and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 *   - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 *   - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *     (e.g. --istr-eq 2:abc)
 *   - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 *   - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 *   - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *     (e.g. --ff-absdiff-le 2:4:0.5).
200 * 201 * An actual FieldsPredicate takes the fields from the line and the closure args and 202 * runs the test. For example, a function testing if a field is less than a specific 203 * value would pull the specified field from the fields array, convert the string to 204 * a number, then run the less-than test. 205 */ 206 alias FieldUnaryPredicate = bool function(const char[][] fields, size_t index); 207 alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value); 208 alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value); 209 alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value); 210 alias FieldVsRegexPredicate = bool function(const char[][] fields, size_t index, Regex!char value); 211 alias FieldVsFieldPredicate = bool function(const char[][] fields, size_t index1, size_t index2); 212 alias FieldFieldNumPredicate = bool function(const char[][] fields, size_t index1, size_t index2, double value); 213 214 FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index) 215 { 216 return fields => fn(fields, index); 217 } 218 219 FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value) 220 { 221 return fields => fn(fields, index, value); 222 } 223 224 FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value) 225 { 226 return fields => fn(fields, index, value); 227 } 228 229 FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value) 230 { 231 return fields => fn(fields, index, value); 232 } 233 234 FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value) 235 { 236 return fields => fn(fields, index, value); 237 } 238 239 FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2) 240 { 241 return fields => fn(fields, index1, index2); 242 } 

/* Wraps a two-field-plus-number predicate, both field indexes, and the number in a closure. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] lineFields)
    {
        return fn(lineFields, index1, index2, value);
    };
}

/* Predicate functions - The functions that do the real work inside a FieldsPredicate.
 * There is one per command line operator, and each has the signature of one of the
 * FieldsPredicate function aliases defined above.
 */

/* Empty / blank tests. Blank means empty or consisting of whitespace only. */
bool fldEmpty(const char[][] fields, size_t index)
{
    const field = fields[index];
    return field.length == 0;
}

bool fldNotEmpty(const char[][] fields, size_t index)
{
    const field = fields[index];
    return field.length != 0;
}

bool fldBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return blankMatch ? true : false;
}

bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankMatch = fields[index].matchFirst(ctRegex!`^\s*$`);
    return blankMatch ? false : true;
}

/* Numeric classification tests. The conversion to double is only attempted when
 * isNumeric accepts the field.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

bool fldIsFinite(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isFinite;
}

bool fldIsNaN(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isNaN;
}

bool fldIsInfinity(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isInfinity;
}

/* Field vs number tests. The field is converted with to!double, which throws if the
 * field is not convertible.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    const fieldValue = fields[index].to!double;
    return fieldValue != val;
}

/* Field vs string tests, case-sensitive. */
bool strLE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field != val;
}

bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler. Only the field needs lower-casing here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(fields[index].asLowerCase, val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !equal(fields[index].asLowerCase, val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(fields[index].asLowerCase, val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !canFind(fields[index].asLowerCase, val);
}

/* Note: The regex value itself carries the case-sensitivity setting, so the regex
 * predicates below serve both the case-sensitive and case-insensitive regex operators.
288 */ 289 bool regexMatch(const char[][] fields, size_t index, Regex!char val) { return cast(bool) fields[index].matchFirst(val); } 290 bool regexNotMatch(const char[][] fields, size_t index, Regex!char val) { return !fields[index].matchFirst(val); } 291 292 bool charLenLE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength <= val; } 293 bool charLenLT(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength < val; } 294 bool charLenGE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength >= val; } 295 bool charLenGT(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength > val; } 296 bool charLenEQ(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength == val; } 297 bool charLenNE(const char[][] fields, size_t index, double val) { return fields[index].byGrapheme.walkLength != val; } 298 299 bool byteLenLE(const char[][] fields, size_t index, double val) { return fields[index].length <= val; } 300 bool byteLenLT(const char[][] fields, size_t index, double val) { return fields[index].length < val; } 301 bool byteLenGE(const char[][] fields, size_t index, double val) { return fields[index].length >= val; } 302 bool byteLenGT(const char[][] fields, size_t index, double val) { return fields[index].length > val; } 303 bool byteLenEQ(const char[][] fields, size_t index, double val) { return fields[index].length == val; } 304 bool byteLenNE(const char[][] fields, size_t index, double val) { return fields[index].length != val; } 305 306 bool ffLE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double <= fields[index2].to!double; } 307 bool ffLT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double < fields[index2].to!double; } 308 bool ffGE(const char[][] fields, size_t index1, size_t index2) { return 
fields[index1].to!double >= fields[index2].to!double; } 309 bool ffGT(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double > fields[index2].to!double; } 310 bool ffEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double == fields[index2].to!double; } 311 bool ffNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1].to!double != fields[index2].to!double; } 312 bool ffStrEQ(const char[][] fields, size_t index1, size_t index2) { return fields[index1] == fields[index2]; } 313 bool ffStrNE(const char[][] fields, size_t index1, size_t index2) { return fields[index1] != fields[index2]; } 314 bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2) 315 { 316 return equal(fields[index1].asLowerCase, fields[index2].asLowerCase); 317 } 318 bool ffIStrNE(const char[][] fields, size_t index1, size_t index2) 319 { 320 return !equal(fields[index1].asLowerCase, fields[index2].asLowerCase); 321 } 322 323 auto AbsDiff(double v1, double v2) { return (v1 - v2).abs; } 324 auto RelDiff(double v1, double v2) { return (v1 - v2).abs / min(v1.abs, v2.abs); } 325 326 bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value) 327 { 328 return AbsDiff(fields[index1].to!double, fields[index2].to!double) <= value; 329 } 330 bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value) 331 { 332 return AbsDiff(fields[index1].to!double, fields[index2].to!double) > value; 333 } 334 bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value) 335 { 336 return RelDiff(fields[index1].to!double, fields[index2].to!double) <= value; 337 } 338 bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value) 339 { 340 return RelDiff(fields[index1].to!double, fields[index2].to!double) > value; 341 } 342 343 /* Command line option handlers - There is a command line option handler for each 344 * predicate type. 
That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate, 345 * etc. Option handlers are passed the tests array, the predicate function, and the 346 * command line option arguments. A FieldsPredicate delegate is created and appended to 347 * the tests array. An exception is thrown if errors are detected while processing the 348 * option, the error text is intended for the end user. 349 * 350 * These option handlers have similar functionality, differing in option processing and 351 * error message generation. fieldVsNumberOptionHandler is described as an example. It 352 * handles command options such as '--lt 3:1000', which tests field 3 for a values less 353 * than 1000. It is passed the tests array, the 'numLE' function to use for the test, and 354 * the string "3:1000" representing the option value. It parses the option value into 355 * field index (unsigned int) and value (double). These are wrapped in a FieldsPredicate 356 * which is added to the tests array. An error is signaled if the option string is invalid. 357 * 358 * During processing, fields indexes are converted from one-based to zero-based. As an 359 * optimization, the maximum field index is also tracked. This allows early termination of 360 * line splitting. 361 */ 362 void fieldUnaryOptionHandler( 363 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal) 364 { 365 import std.range : enumerate; 366 import std.typecons : Yes, No; 367 import tsv_utils.common.utils : parseFieldList; 368 369 try foreach (fieldNum, fieldIndex; 370 optionVal.parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 371 { 372 tests ~= makeFieldUnaryDelegate(fn, fieldIndex); 373 maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; 374 } 375 catch (Exception e) 376 { 377 import std.format : format; 378 e.msg = format("[--%s %s]. 
%s\n Expected: '--%s <field>' or '--%s <field-list>'.", 379 option, optionVal, e.msg, option, option); 380 throw e; 381 } 382 } 383 384 void fieldVsNumberOptionHandler( 385 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal) 386 { 387 import std.range : enumerate; 388 import std.typecons : Yes, No; 389 import tsv_utils.common.utils : parseFieldList; 390 391 auto formatErrorMsg(string option, string optionVal, string errorMessage="") 392 { 393 import std.format; 394 395 string optionalSpace = (errorMessage.length == 0) ? "" : " "; 396 return format( 397 "Invalid option: '--%s %s'.%s%s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val> where <val> is a number.", 398 option, optionVal, optionalSpace, errorMessage, option, option); 399 } 400 401 immutable valSplit = findSplit(optionVal, ":"); 402 403 if (valSplit[1].length == 0 || valSplit[2].length == 0) 404 { 405 throw new Exception(formatErrorMsg(option, optionVal)); 406 } 407 408 double value; 409 try value = valSplit[2].to!double; 410 catch (Exception e) 411 { 412 throw new Exception(formatErrorMsg(option, optionVal, e.msg)); 413 } 414 415 try foreach (fieldNum, fieldIndex; 416 valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 417 { 418 tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value); 419 maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; 420 } 421 catch (Exception e) 422 { 423 import std.format : format; 424 e.msg = format( 425 "[--%s %s]. 
%s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val> where <val> is a number.", 426 option, optionVal, e.msg, option, option); 427 throw e; 428 } 429 } 430 431 void fieldVsStringOptionHandler( 432 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal) 433 { 434 import std.range : enumerate; 435 import std.typecons : Yes, No; 436 import tsv_utils.common.utils : parseFieldList; 437 438 immutable valSplit = findSplit(optionVal, ":"); 439 if (valSplit[1].length == 0 || valSplit[2].length == 0) 440 { 441 throw new Exception( 442 format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.", 443 option, optionVal, option, option)); 444 } 445 446 string value = valSplit[2].to!string; 447 448 try foreach (fieldNum, fieldIndex; 449 valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 450 { 451 tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value); 452 maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; 453 } 454 catch (Exception e) 455 { 456 import std.format : format; 457 e.msg = format( 458 "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.", 459 option, optionVal, e.msg, option, option); 460 throw e; 461 } 462 } 463 464 /* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the 465 * case-insensitive comparison will be done on lower-cased values. 
466 */ 467 void fieldVsIStringOptionHandler( 468 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal) 469 { 470 import std.range : enumerate; 471 import std.typecons : Yes, No; 472 import tsv_utils.common.utils : parseFieldList; 473 474 immutable valSplit = findSplit(optionVal, ":"); 475 if (valSplit[1].length == 0 || valSplit[2].length == 0) 476 { 477 throw new Exception( 478 format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.", 479 option, optionVal, option, option)); 480 } 481 482 string value = valSplit[2].to!string; 483 484 try foreach (fieldNum, fieldIndex; 485 valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 486 { 487 tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, value.to!dstring.toLower); 488 maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; 489 } 490 catch (Exception e) 491 { 492 import std.format : format; 493 e.msg = format( 494 "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.", 495 option, optionVal, e.msg, option, option); 496 throw e; 497 } 498 } 499 500 void fieldVsRegexOptionHandler( 501 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal, 502 bool caseSensitive) 503 { 504 import std.range : enumerate; 505 import std.typecons : Yes, No; 506 import tsv_utils.common.utils : parseFieldList; 507 508 immutable valSplit = findSplit(optionVal, ":"); 509 if (valSplit[1].length == 0 || valSplit[2].length == 0) 510 { 511 throw new Exception( 512 format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.", 513 option, optionVal, option, option)); 514 } 515 516 Regex!char value; 517 try 518 { 519 immutable modifiers = caseSensitive ? 
"" : "i"; 520 value = regex(valSplit[2], modifiers); 521 } 522 catch (Exception e) 523 { 524 throw new Exception( 525 format("Invalid regular expression: '--%s %s'. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.", 526 option, optionVal, e.msg, option, option)); 527 } 528 529 try foreach (fieldNum, fieldIndex; 530 valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 531 { 532 tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, value); 533 maxFieldIndex = (fieldIndex > maxFieldIndex) ? fieldIndex : maxFieldIndex; 534 } 535 catch (Exception e) 536 { 537 import std.format : format; 538 e.msg = format( 539 "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.", 540 option, optionVal, e.msg, option, option); 541 throw e; 542 } 543 } 544 545 void fieldVsFieldOptionHandler( 546 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal) 547 { 548 immutable valSplit = findSplit(optionVal, ":"); 549 if (valSplit[1].length == 0 || valSplit[2].length == 0) 550 { 551 throw new Exception( 552 format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.", 553 option, optionVal, option)); 554 } 555 size_t field1; 556 size_t field2; 557 try 558 { 559 field1 = valSplit[0].to!size_t; 560 field2 = valSplit[2].to!size_t; 561 } 562 catch (Exception e) 563 { 564 throw new Exception( 565 format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.", 566 option, optionVal, option)); 567 } 568 569 if (field1 == 0 || field2 == 0) 570 { 571 throw new Exception( 572 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 573 } 574 575 if (field1 == field2) 576 { 577 throw new Exception( 578 format("Invalid option: '--%s %s'. 
Field1 and field2 must be different fields", option, optionVal)); 579 } 580 581 immutable size_t zeroBasedIndex1 = field1 - 1; 582 immutable size_t zeroBasedIndex2 = field2 - 1; 583 tests ~= makeFieldVsFieldDelegate(fn, zeroBasedIndex1, zeroBasedIndex2); 584 maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2); 585 } 586 587 588 void fieldFieldNumOptionHandler( 589 ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal) 590 { 591 size_t field1; 592 size_t field2; 593 double value; 594 immutable valSplit = findSplit(optionVal, ":"); 595 auto invalidOption = (valSplit[1].length == 0 || valSplit[2].length == 0); 596 597 if (!invalidOption) 598 { 599 immutable valSplit2 = findSplit(valSplit[2], ":"); 600 invalidOption = (valSplit2[1].length == 0 || valSplit2[2].length == 0); 601 602 if (!invalidOption) 603 { 604 try 605 { 606 field1 = valSplit[0].to!size_t; 607 field2 = valSplit2[0].to!size_t; 608 value = valSplit2[2].to!double; 609 } 610 catch (Exception e) 611 { 612 invalidOption = true; 613 } 614 } 615 } 616 617 if (invalidOption) 618 { 619 throw new Exception( 620 format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.", 621 option, optionVal, option)); 622 } 623 if (field1 == 0 || field2 == 0) 624 { 625 throw new Exception( 626 format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal)); 627 } 628 if (field1 == field2) 629 { 630 throw new Exception( 631 format("Invalid option: '--%s %s'. 
Field1 and field2 must be different fields", option, optionVal)); 632 } 633 634 immutable size_t zeroBasedIndex1 = field1 - 1; 635 immutable size_t zeroBasedIndex2 = field2 - 1; 636 tests ~= makeFieldFieldNumDelegate(fn, zeroBasedIndex1, zeroBasedIndex2, value); 637 maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2); 638 } 639 640 /** Command line options - This struct holds the results of command line option processing. 641 * It also has a method, processArgs, that invokes command line arg processing. 642 */ 643 struct TsvFilterOptions 644 { 645 string programName; 646 FieldsPredicate[] tests; // Derived from tests 647 size_t maxFieldIndex; // Derived from tests 648 bool hasHeader = false; // --H|header 649 bool invert = false; // --invert 650 bool disjunct = false; // --or 651 char delim = '\t'; // --delimiter 652 bool helpVerbose = false; // --help-verbose 653 bool helpOptions = false; // --help-options 654 bool versionWanted = false; // --V|version 655 656 /* Returns a tuple. First value is true if command line arguments were successfully 657 * processed and execution should continue, or false if an error occurred or the user 658 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 659 * 660 * Returning true (execution continues) means args have been validated and the 661 * tests array has been established. 662 */ 663 auto processArgs (ref string[] cmdArgs) 664 { 665 import std.getopt; 666 import std.path : baseName, stripExtension; 667 import tsv_utils.common.getopt_inorder; 668 669 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 670 671 /* Command option handlers - One handler for each option. These conform to the 672 * getopt required handler signature, and separate knowledge the specific command 673 * option text from the option processing. 
*/
        /* getopt callbacks. Each one receives the (option, value) pair from getopt and
         * forwards it, together with this struct's `tests` and `maxFieldIndex` and a
         * pointer to the test function, to the option handler for that family of tests.
         */

        /* Unary field tests (empty/blank). */
        void handlerFldEmpty(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty,    option, value); }
        void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); }
        void handlerFldBlank(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank,    option, value); }
        void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); }

        /* Unary field tests (numeric classification). */
        void handlerFldIsNumeric(string option, string value)  { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric,  option, value); }
        void handlerFldIsFinite(string option, string value)   { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite,   option, value); }
        void handlerFldIsNaN(string option, string value)      { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN,      option, value); }
        void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); }

        /* Field vs number comparisons. */
        void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); }
        void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); }
        void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); }
        void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); }
        void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); }
        void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); }

        /* Field vs string comparisons. */
        void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); }
        void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
        void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
        void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
        void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
        void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }

        /* Substring tests. */
        void handlerStrInFld(string option, string value)    { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld,    option, value); }
        void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }

        /* Case-insensitive string tests; these go through the IString handler. */
        void handlerIStrEQ(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ,       option, value); }
        void handlerIStrNE(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE,       option, value); }
        void handlerIStrInFld(string option, string value)    { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld,    option, value); }
        void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }

        /* Regular expression tests. The case-sensitive and case-insensitive variants share
         * the same test functions; only the trailing flag differs: true for --regex and
         * --not-regex, false for --iregex and --not-iregex.
         */
        void handlerRegexMatch(string option, string value)     { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, true); }
        void handlerRegexNotMatch(string option, string value)  { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
        void handlerIRegexMatch(string option, string value)    { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, false); }
        void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }

        /* Character-length comparisons. */
        void handlerCharLenLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenLE, option, value); }
        void handlerCharLenLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenLT, option, value); }
        void handlerCharLenGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenGE, option, value); }
        void handlerCharLenGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenGT, option, value); }
        void handlerCharLenEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenEQ, option, value); }
        void handlerCharLenNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenNE, option, value); }

        /* Byte-length comparisons. */
        void handlerByteLenLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenLE, option, value); }
        void handlerByteLenLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenLT, option, value); }
        void handlerByteLenGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenGE, option, value); }
        void handlerByteLenGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenGT, option, value); }
        void handlerByteLenEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenEQ, option, value); }
        void handlerByteLenNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenNE, option, value); }

        /* Field vs field numeric comparisons. */
        void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
        void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
        void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
        void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
        void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
        void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }

        /* Field vs field string comparisons. */
        void handlerFFStrEQ(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ,  option, value); }
        void handlerFFStrNE(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE,  option, value); }
        void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
        void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }

        /* Field vs field difference tests (FIELD1:FIELD2:NUM). */
        void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
        void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
        void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
        void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "help-options", " Print the options list by itself.", &helpOptions,
                /* Case-sensitive matching is switched on around the single-letter
                 * aliases so that 'V' (version) and 'v' (invert) remain distinct.
                 */
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "or", " Evaluate tests as an OR rather than an AND.", &disjunct,
                std.getopt.config.caseSensitive,
                "v|invert", " Invert the filter, printing lines that do not match.", &invert,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                "empty", "<field-list> True if FIELD is empty.", &handlerFldEmpty,
                "not-empty", "<field-list> True if FIELD is not empty.", &handlerFldNotEmpty,
                "blank", "<field-list> True if FIELD is empty or all whitespace.", &handlerFldBlank,
                "not-blank", "<field-list> True if FIELD contains a non-whitespace character.", &handlerFldNotBlank,

                "is-numeric", "<field-list> True if FIELD is interpretable as a number.", &handlerFldIsNumeric,
                "is-finite", "<field-list> True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
                "is-nan", "<field-list> True if FIELD is NaN.", &handlerFldIsNaN,
                "is-infinity", "<field-list> True if FIELD is infinity.", &handlerFldIsInfinity,

                "le", "<field-list>:NUM FIELD <= NUM (numeric).", &handlerNumLE,
                "lt", "<field-list>:NUM FIELD < NUM (numeric).", &handlerNumLT,
                "ge", "<field-list>:NUM FIELD >= NUM (numeric).", &handlerNumGE,
                "gt", "<field-list>:NUM FIELD > NUM (numeric).", &handlerNumGT,
                "eq", "<field-list>:NUM FIELD == NUM (numeric).", &handlerNumEQ,
                "ne", "<field-list>:NUM FIELD != NUM (numeric).", &handlerNumNE,

                "str-le", "<field-list>:STR FIELD <= STR (string).", &handlerStrLE,
                "str-lt", "<field-list>:STR FIELD < STR (string).", &handlerStrLT,
                "str-ge", "<field-list>:STR FIELD >= STR (string).", &handlerStrGE,
                "str-gt", "<field-list>:STR FIELD > STR (string).", &handlerStrGT,
                "str-eq", "<field-list>:STR FIELD == STR (string).", &handlerStrEQ,
                "istr-eq", "<field-list>:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
                "str-ne", "<field-list>:STR FIELD != STR (string).", &handlerStrNE,
                "istr-ne", "<field-list>:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE,
                "str-in-fld", "<field-list>:STR FIELD contains STR (substring search).", &handlerStrInFld,
                "istr-in-fld", "<field-list>:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
                "str-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld,
                "istr-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,

                "regex", "<field-list>:REGEX FIELD matches regular expression.", &handlerRegexMatch,
                "iregex", "<field-list>:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
                "not-regex", "<field-list>:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch,
                "not-iregex", "<field-list>:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,

                "char-len-le", "<field-list>:NUM character-length(FIELD) <= NUM.", &handlerCharLenLE,
                "char-len-lt", "<field-list>:NUM character-length(FIELD) < NUM.", &handlerCharLenLT,
                "char-len-ge", "<field-list>:NUM character-length(FIELD) >= NUM.", &handlerCharLenGE,
                "char-len-gt", "<field-list>:NUM character-length(FIELD) > NUM.", &handlerCharLenGT,
                "char-len-eq", "<field-list>:NUM character-length(FIELD) == NUM.", &handlerCharLenEQ,
                "char-len-ne", "<field-list>:NUM character-length(FIELD) != NUM.", &handlerCharLenNE,

                "byte-len-le", "<field-list>:NUM byte-length(FIELD) <= NUM.", &handlerByteLenLE,
                "byte-len-lt", "<field-list>:NUM byte-length(FIELD) < NUM.", &handlerByteLenLT,
                "byte-len-ge", "<field-list>:NUM byte-length(FIELD) >= NUM.", &handlerByteLenGE,
                "byte-len-gt", "<field-list>:NUM byte-length(FIELD) > NUM.", &handlerByteLenGT,
                "byte-len-eq", "<field-list>:NUM byte-length(FIELD) == NUM.", &handlerByteLenEQ,
                "byte-len-ne", "<field-list>:NUM byte-length(FIELD) != NUM.", &handlerByteLenNE,

                "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
                "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT,
                "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
                "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT,
                "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
                "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE,
                "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
                "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
                "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE,
                "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,

                "ff-absdiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT,
                );

            /* Both help texts are a bit long. In this case, for "regular" help, don't
             * print options, just the text. The text summarizes the options.
*/
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }
        }
        catch (Exception e)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** tsvFilter processes the input files and runs the tests.
 *
 * Each input file is read one line at a time. When no files are given, or a file
 * name is "-", standard input is read. If cmdopt.hasHeader is set, the first line
 * of the first file is written as a header and the first line of every later file
 * is dropped. Every other line is split on cmdopt.delim and the tests in
 * cmdopt.tests are run against its fields. They are combined with OR when
 * cmdopt.disjunct is set, with AND otherwise, and the result is negated when
 * cmdopt.invert is set. Lines that pass are written to standard output.
 *
 * Params:
 *   cmdopt     = Options produced by command line processing.
 *   inputFiles = Names of the files to read.
 *
 * Throws:
 *   Exception if a line has fewer fields than the tests need, or if a test throws
 *   (for example when a field cannot be converted to the expected type). The
 *   message includes the file name and line number.
 */
void tsvFilter(const TsvFilterOptions cmdopt, const string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* BufferedOutputRange improves performance on narrow files with high percentages of
     * writes. Want responsive output if output is rare, so ensure the first matched
     * line is written, and that writes separated by long stretches of non-matched lines
     * are written.
     */
    enum maxInputLinesWithoutBufferFlush = 1024;

    /* Starts above the threshold so the first matched line forces a flush. */
    size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1;

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Process each input file, one line at a time. */
    auto lineFields = new char[][](cmdopt.maxFieldIndex + 1);
    bool headerWritten = false;
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);
            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header. Output on the first file, skip subsequent files. */
                if (!headerWritten)
                {
                    bufferedOutput.appendln(line);
                    headerWritten = true;
                }
            }
            else
            {
                /* Copy the needed number of fields to the fields array. Splitting
                 * stops once the highest field index used by any test is filled.
                 */
                int fieldIndex = -1;
                foreach (fieldValue; line.splitter(cmdopt.delim))
                {
                    if (fieldIndex == cast(long) cmdopt.maxFieldIndex) break;
                    fieldIndex++;
                    lineFields[fieldIndex] = fieldValue;
                }

                if (fieldIndex == -1)
                {
                    assert(line.length == 0);
                    /* Bug work-around. Currently empty lines are not handled properly by splitter.
                     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * Work-around: Point to the line. It's an empty string.
                     */
                    fieldIndex++;
                    lineFields[fieldIndex] = line;
                }

                if (fieldIndex < cast(long) cmdopt.maxFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, lineNum));
                }

                /* Run the tests. Tests will fail (throw) if a field cannot be converted
                 * to the expected type.
                 */
                try
                {
                    inputLinesWithoutBufferFlush++;
                    bool passed = cmdopt.disjunct ?
                        cmdopt.tests.any!(x => x(lineFields)) :
                        cmdopt.tests.all!(x => x(lineFields));
                    if (cmdopt.invert) passed = !passed;
                    if (passed)
                    {
                        /* appendln reports whether it flushed; if it did not and too many
                         * input lines have gone by since the last flush, flush explicitly.
                         */
                        const bool wasFlushed = bufferedOutput.appendln(line);
                        if (wasFlushed) inputLinesWithoutBufferFlush = 0;
                        else if (inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush)
                        {
                            bufferedOutput.flush;
                            inputLinesWithoutBufferFlush = 0;
                        }
                    }
                }
                catch (Exception e)
                {
                    /* Re-throw with file/line context. The original exception is kept as
                     * the next-in-chain so its details are not lost.
                     */
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               e.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                               (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""),
                        e);
                }
            }
        }
    }
}