/**
Command line tool that filters TSV files.

This tool filters tab-delimited files based on numeric or string comparisons
against specific fields. See the helpText string for details.

Copyright (c) 2015-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_filter;

import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.exception : enforce;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.range;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons;
import std.uni: asLowerCase, toLower, byGrapheme;

/* The program has two main parts, command line arg processing and processing the input
 * files. Much of the work is in command line arg processing. This sets up the tests run
 * against each input line. The tests are an array of delegates (closures) run against the
 * fields in the line. The tests are based on command line arguments, of which there is
 * a lengthy set, one for each test.
 */

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

/** Main program. Invokes command line arg processing and tsv-filter to perform
 * the real work. Any errors are caught and reported.
 *
 * Params:
 *   cmdArgs = The command line arguments, program name first.
 *
 * Returns: The process exit code. When argument processing says not to continue,
 *   the exit code it supplied is returned. Otherwise 0 is returned on success and 1
 *   if an Exception escapes the filtering step.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;

    /* processArgs returns a tuple: [0] is true if execution should continue,
     * [1] is the exit code to use when it should not.
     */
    const r = cmdopt.processArgs(cmdArgs);
    if (!r[0]) return r[1];

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    try tsvFilter(cmdopt);
    catch (Exception e)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, e.msg);
        return 1;
    }
    return 0;
}

immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne FIELD:STR
  Example: --str-eq 3:abc     // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex FIELD:REGEX
  Example: --regex '3:ab*c'   // True if field 3 contains "ac", "abc", "abbc", etc.

* Test a field's character or byte length
  Syntax:  --char-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
           --byte-len-[le|lt|ge|gt|eq|ne] FIELD:NUM
  Example: --char-len-lt 2:10  // True if field 2 is less than 10 characters long.
           --byte-len-gt 2:10  // True if field 2 is greater than 10 bytes long.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne FIELD1:FIELD2
  Example: --ff-eq 2:4        // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4    // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25  // True if abs(field1 - field2) <= 0.25

EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Field lists can be used to specify multiple fields at once. For example:

  tsv-filter --not-blank 1-10 --str-ne 1,2,5:'--' data.tsv

tests that fields 1-10 are not blank and fields 1,2,5 are not "--".

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Test a field's character or byte length.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";

immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";

/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing to build the test structure.
 */

/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicates types are:
 *
 *   - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 *   - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *     and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 *   - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 *   - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *     (e.g. --istr-eq 2:abc)
 *   - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 *   - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
*
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
 */
alias FieldUnaryPredicate     = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate  = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate  = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate   = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate   = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);

/* The 'make' functions below each bind a predicate function pointer together with its
 * fixed arguments (field index or indexes, plus any comparison value) into a closure.
 * The returned FieldsPredicate only needs the split input line to run the test.
 */

/* Binds a single-field predicate to the field index it tests. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields) { return fn(fields, index); };
}

/* Binds a field-vs-number predicate to its field index and numeric value. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-string predicate to its field index and string value. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a case-insensitive field-vs-string predicate to its field index and dstring value. */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-regex predicate to its field index and compiled regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

/* Binds a field-vs-field predicate to the two field indexes it compares. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2); };
}
/* Binds a two-field-plus-number predicate to its field indexes and numeric value. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2, value); };
}

/* Predicate functions - These are the actual functions used in a FieldsPredicate. They
 * are a direct reflection of the operators available via command line args. Each matches
 * one of the FieldsPredicate function aliases defined above.
 */

/* Empty / blank tests. 'Blank' means the whole field matches the regex ^\s*$. */
bool fldEmpty(const char[][] fields, size_t index)
{
    return fields[index].length == 0;
}

bool fldNotEmpty(const char[][] fields, size_t index)
{
    return !fldEmpty(fields, index);
}

bool fldBlank(const char[][] fields, size_t index)
{
    auto captures = matchFirst(fields[index], ctRegex!`^\s*$`);
    return cast(bool) captures;
}

bool fldNotBlank(const char[][] fields, size_t index)
{
    return !fldBlank(fields, index);
}

/* Numeric classification tests. The field is converted to double only after
 * isNumeric has accepted it.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

bool fldIsFinite(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isFinite;
}

bool fldIsNaN(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isNaN;
}

bool fldIsInfinity(const char[][] fields, size_t index)
{
    const field = fields[index];
    if (!field.isNumeric) return false;
    return field.to!double.isInfinity;
}

/* Field vs number comparisons. The field is converted with to!double, which throws
 * if the text cannot be converted.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    immutable fieldValue = fields[index].to!double;
    return fieldValue != val;
}

/* Field vs string comparisons, using the built-in array comparison operators. */
bool strLE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field != val;
}

/* Substring search, case-sensitive. */
bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !strInFld(fields, index, val);
}

/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler. Only the field is lower-cased here.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(asLowerCase(fields[index]), val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !istrEQ(fields, index, val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(asLowerCase(fields[index]), val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !istrInFld(fields, index, val);
}

/* Note: Case-sensitivity is built into the regex value, so these regex predicates are
 * used for both case-sensitive and case-insensitive regex operators.
*/
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto captures = matchFirst(fields[index], val);
    return cast(bool) captures;
}

bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    return !regexMatch(fields, index, val);
}

/* Character length tests. Length is the number of graphemes in the field. */
bool charLenLE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount <= val;
}

bool charLenLT(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount < val;
}

bool charLenGE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount >= val;
}

bool charLenGT(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount > val;
}

bool charLenEQ(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount == val;
}

bool charLenNE(const char[][] fields, size_t index, double val)
{
    immutable graphemeCount = fields[index].byGrapheme.walkLength;
    return graphemeCount != val;
}

/* Byte length tests. Length is the array length of the field (UTF-8 code units). */
bool byteLenLE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount <= val;
}

bool byteLenLT(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount < val;
}

bool byteLenGE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount >= val;
}

bool byteLenGT(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount > val;
}

bool byteLenEQ(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount == val;
}

bool byteLenNE(const char[][] fields, size_t index, double val)
{
    immutable byteCount = fields[index].length;
    return byteCount != val;
}

/* Field vs field numeric comparisons. Both fields are converted with to!double,
 * first field first.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs <= rhs;
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs < rhs;
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs >= rhs;
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs > rhs;
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs == rhs;
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return lhs != rhs;
}

/* Field vs field string comparisons, case-sensitive and case-insensitive. */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs == rhs;
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs != rhs;
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lhsLowered = fields[index1].asLowerCase;
    auto rhsLowered = fields[index2].asLowerCase;
    return equal(lhsLowered, rhsLowered);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return !ffIStrEQ(fields, index1, index2);
}

/* Absolute difference: abs(v1 - v2). */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: abs(v1 - v2) divided by the smaller of abs(v1) and abs(v2).
 * The divisor is zero when either value is zero; the result then follows normal
 * floating point division rules.
 */
auto RelDiff(double v1, double v2)
{
    immutable smallerMagnitude = min(abs(v1), abs(v2));
    return abs(v1 - v2) / smallerMagnitude;
}

/* Field vs field difference tests against a fixed numeric threshold. */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable lhs = fields[index1].to!double;
    immutable rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) > value;
}

/* Command line option handlers - There is a command line option handler for each
 * predicate type.
That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
 * etc. Option handlers are passed the tests array, the predicate function, and the
 * command line option arguments. A FieldsPredicate delegate is created and appended to
 * the tests array. An exception is thrown if errors are detected while processing the
 * option, the error text is intended for the end user.
 *
 * These option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for values less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
 * the string "3:1000" representing the option value. It parses the option value into
 * zero-based field indexes (size_t) and a value (double). These are wrapped in
 * FieldsPredicates which are added to the tests array, one per field in the field list.
 * An error is signaled if the option string is invalid.
 *
 * During processing, fields indexes are converted from one-based to zero-based. As an
 * optimization, the maximum field index is also tracked. This allows early termination of
 * line splitting.
 */

/* Handles options taking only a field or field list (e.g. '--empty 4').
 *
 * One test is appended to 'tests' for each field in the field list, and maxFieldIndex
 * is raised to the largest zero-based index seen. If parsing the field list throws,
 * the exception message is rewritten to include the option text and is re-thrown.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    import std.typecons : Yes;
    import tsv_utils.common.utils : parseFieldList;

    try
    {
        foreach (fieldIndex; optionVal.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format("[--%s %s]. %s\n Expected: '--%s <field>' or '--%s <field-list>'.",
                       option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Handles options of the form '<field-list>:<number>' (e.g. '--lt 3:1000').
 *
 * The option value is split at the first ':'. The text after it must convert to a
 * double; the text before it is parsed as a field list. One test is appended to
 * 'tests' per field and maxFieldIndex is updated. An Exception with a user-oriented
 * message is thrown if the ':' or the value is missing, if the value is not a number,
 * or if field list parsing fails.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    import std.typecons : Yes;
    import tsv_utils.common.utils : parseFieldList;

    /* Builds the 'Invalid option' message, optionally including an underlying error. */
    string formatErrorMsg(string errorMessage = "")
    {
        immutable optionalSpace = (errorMessage.length == 0) ? "" : " ";
        return format(
            "Invalid option: '--%s %s'.%s%s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a number.",
            option, optionVal, optionalSpace, errorMessage, option, option);
    }

    immutable valSplit = findSplit(optionVal, ":");

    /* Both the ':' separator and a non-empty value after it are required. */
    enforce(valSplit[1].length != 0 && valSplit[2].length != 0, formatErrorMsg());

    double value;
    try value = valSplit[2].to!double;
    catch (Exception e)
    {
        throw new Exception(formatErrorMsg(e.msg));
    }

    try
    {
        foreach (fieldIndex; valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            tests ~= makeFieldVsNumberDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a number.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Handles options of the form '<field-list>:<string>' (e.g. '--str-eq 2:abc').
 *
 * The option value is split at the first ':'. Everything after it is the comparison
 * string; the text before it is parsed as a field list. One test is appended to
 * 'tests' per field and maxFieldIndex is updated. An Exception with a user-oriented
 * message is thrown if the ':' or the value is missing or if field list parsing fails.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    import std.typecons : Yes;
    import tsv_utils.common.utils : parseFieldList;

    immutable valSplit = findSplit(optionVal, ":");

    enforce(valSplit[1].length != 0 && valSplit[2].length != 0,
            format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
                   option, optionVal, option, option));

    immutable string value = valSplit[2];

    try
    {
        foreach (fieldIndex; valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
 * case-insensitive comparison will be done on lower-cased values.
*/
/* Handles options of the form '<field-list>:<string>' for case-insensitive tests
 * (e.g. '--istr-in-fld 2:abc').
 *
 * The option value is split at the first ':'. The text after it is converted to a
 * lower-cased dstring once and shared by every test created; the text before it is
 * parsed as a field list. One test is appended to 'tests' per field and maxFieldIndex
 * is updated. An Exception with a user-oriented message is thrown if the ':' or the
 * value is missing or if processing the value or field list fails.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    import std.typecons : Yes;
    import tsv_utils.common.utils : parseFieldList;

    immutable valSplit = findSplit(optionVal, ":");

    enforce(valSplit[1].length != 0 && valSplit[2].length != 0,
            format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
                   option, optionVal, option, option));

    try
    {
        /* The conversion does not depend on the field, so do it once. It stays inside
         * the try block so a conversion failure is still reported with the option text.
         */
        immutable dstring loweredValue = valSplit[2].to!dstring.toLower;

        foreach (fieldIndex; valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredValue);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a string.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Handles options of the form '<field-list>:<regex>' (e.g. '--regex 2:ab*c').
 *
 * The option value is split at the first ':'. The text after it is compiled as a
 * regular expression, with the "i" modifier when caseSensitive is false; the text
 * before it is parsed as a field list. One test is appended to 'tests' per field and
 * maxFieldIndex is updated. An Exception with a user-oriented message is thrown if the
 * ':' or the value is missing, if the regex does not compile, or if field list parsing
 * fails.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    import std.typecons : Yes;
    import tsv_utils.common.utils : parseFieldList;

    immutable valSplit = findSplit(optionVal, ":");

    enforce(valSplit[1].length != 0 && valSplit[2].length != 0,
            format("Invalid option: '--%s %s'.\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
                   option, optionVal, option, option));

    Regex!char value;
    try
    {
        immutable modifiers = caseSensitive ? "" : "i";
        value = regex(valSplit[2], modifiers);
    }
    catch (Exception e)
    {
        throw new Exception(
            format("Invalid regular expression: '--%s %s'. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
                   option, optionVal, e.msg, option, option));
    }

    try
    {
        foreach (fieldIndex; valSplit[0].parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, value);
            maxFieldIndex = max(maxFieldIndex, fieldIndex);
        }
    }
    catch (Exception e)
    {
        e.msg = format(
            "[--%s %s]. %s\n Expected: '--%s <field>:<val>' or '--%s <field-list>:<val>' where <val> is a regular expression.",
            option, optionVal, e.msg, option, option);
        throw e;
    }
}

/* Handles options of the form '<field1>:<field2>' (e.g. '--ff-le 2:4').
 *
 * Both parts must convert to size_t, must be non-zero (field numbers are one-based),
 * and must differ from each other. A single test is appended to 'tests' using the
 * zero-based indexes, and maxFieldIndex is updated. An Exception with a user-oriented
 * message is thrown when any of these checks fail.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    immutable valSplit = findSplit(optionVal, ":");

    enforce(valSplit[1].length != 0 && valSplit[2].length != 0,
            format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));

    size_t field1;
    size_t field2;
    try
    {
        field1 = valSplit[0].to!size_t;
        field2 = valSplit[2].to!size_t;
    }
    catch (Exception e)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }

    enforce(field1 != 0 && field2 != 0,
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));

    enforce(field1 != field2,
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

    immutable size_t zeroBasedIndex1 = field1 - 1;
    immutable size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldVsFieldDelegate(fn, zeroBasedIndex1, zeroBasedIndex2);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}


/* Handles options of the form '<field1>:<field2>:<num>' (e.g. '--ff-absdiff-le 1:3:0.25').
 *
 * The option value is split at the first ':' and the remainder is split again at its
 * first ':'. The two field parts must convert to size_t and the last part to double.
 * Fields must be non-zero and different. A single test is appended to 'tests' using
 * the zero-based indexes, and maxFieldIndex is updated. An Exception with a
 * user-oriented message is thrown when any of these checks fail.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t field1;
    size_t field2;
    double value;
    bool isValidOption = false;

    immutable valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length != 0 && valSplit[2].length != 0)
    {
        immutable valSplit2 = findSplit(valSplit[2], ":");
        if (valSplit2[1].length != 0 && valSplit2[2].length != 0)
        {
            try
            {
                field1 = valSplit[0].to!size_t;
                field2 = valSplit2[0].to!size_t;
                value = valSplit2[2].to!double;
                isValidOption = true;
            }
            catch (Exception e)
            {
                /* Conversion failed. isValidOption stays false so that the enforce
                 * below reports the problem with the standard message.
                 */
            }
        }
    }

    enforce(isValidOption,
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
                   option, optionVal, option));

    enforce(field1 != 0 && field2 != 0,
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));

    enforce(field1 != field2,
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));

    immutable size_t zeroBasedIndex1 = field1 - 1;
    immutable size_t zeroBasedIndex2 = field2 - 1;
    tests ~= makeFieldFieldNumDelegate(fn, zeroBasedIndex1, zeroBasedIndex2, value);
    maxFieldIndex = max(maxFieldIndex, zeroBasedIndex1, zeroBasedIndex2);
}

/** Command line options - This struct holds the results of command line option processing.
620 * It also has a method, processArgs, that invokes command line arg processing. 621 */ 622 struct TsvFilterOptions 623 { 624 import tsv_utils.common.utils : inputSourceRange, InputSourceRange, ReadHeader; 625 626 string programName; 627 InputSourceRange inputSources; // Input files 628 FieldsPredicate[] tests; // Derived from tests 629 size_t maxFieldIndex; // Derived from tests 630 bool hasHeader = false; // --H|header 631 bool invert = false; // --invert 632 bool disjunct = false; // --or 633 char delim = '\t'; // --delimiter 634 bool helpVerbose = false; // --help-verbose 635 bool helpOptions = false; // --help-options 636 bool versionWanted = false; // --V|version 637 638 /* Returns a tuple. First value is true if command line arguments were successfully 639 * processed and execution should continue, or false if an error occurred or the user 640 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 641 * 642 * Returning true (execution continues) means args have been validated and the 643 * tests array has been established. 644 */ 645 auto processArgs (ref string[] cmdArgs) 646 { 647 import std.getopt; 648 import std.path : baseName, stripExtension; 649 import tsv_utils.common.getopt_inorder; 650 651 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 652 653 /* Command option handlers - One handler for each option. These conform to the 654 * getopt required handler signature, and separate knowledge the specific command 655 * option text from the option processing. 
*/
        /* Option handlers. Each nested function is registered with getopt below and
         * forwards to a shared handler, passing the `tests` array, `maxFieldIndex`,
         * the address of the test function to install, and the option name and value
         * received from the command line.
         */

        /* Unary field tests: empty / blank. */
        void handlerFldEmpty(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty,    option, value); }
        void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); }
        void handlerFldBlank(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank,    option, value); }
        void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); }

        /* Unary field tests: numeric classification. */
        void handlerFldIsNumeric(string option, string value)  { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric,  option, value); }
        void handlerFldIsFinite(string option, string value)   { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite,   option, value); }
        void handlerFldIsNaN(string option, string value)      { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN,      option, value); }
        void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); }

        /* Field vs. number comparisons. */
        void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); }
        void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); }
        void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); }
        void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); }
        void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); }
        void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); }

        /* Field vs. string comparisons. */
        void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); }
        void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
        void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
        void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
        void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
        void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }

        /* Substring searches. */
        void handlerStrInFld(string option, string value)    { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld,    option, value); }
        void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }

        /* String tests registered for the "istr-*" (case-insensitive) options. */
        void handlerIStrEQ(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ,       option, value); }
        void handlerIStrNE(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE,       option, value); }
        void handlerIStrInFld(string option, string value)    { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld,    option, value); }
        void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }

        /* Regular expression tests. The same two test functions serve all four
         * options; the trailing bool is true for --regex/--not-regex and false for
         * --iregex/--not-iregex (the options documented below as case-insensitive).
         */
        void handlerRegexMatch(string option, string value)     { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, true); }
        void handlerRegexNotMatch(string option, string value)  { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
        void handlerIRegexMatch(string option, string value)    { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, false); }
        void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }

        /* Character-length comparisons. */
        void handlerCharLenLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenLE, option, value); }
        void handlerCharLenLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenLT, option, value); }
        void handlerCharLenGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenGE, option, value); }
        void handlerCharLenGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenGT, option, value); }
        void handlerCharLenEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenEQ, option, value); }
        void handlerCharLenNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &charLenNE, option, value); }

        /* Byte-length comparisons. */
        void handlerByteLenLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenLE, option, value); }
        void handlerByteLenLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenLT, option, value); }
        void handlerByteLenGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenGE, option, value); }
        void handlerByteLenGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenGT, option, value); }
        void handlerByteLenEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenEQ, option, value); }
        void handlerByteLenNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &byteLenNE, option, value); }

        /* Field vs. field comparisons (numeric). */
        void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
        void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
        void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
        void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
        void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
        void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }

        /* Field vs. field comparisons (string). */
        void handlerFFStrEQ(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ,  option, value); }
        void handlerFFStrNE(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE,  option, value); }
        void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
        void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }

        /* Field vs. field difference tests against a number. */
        void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
        void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
        void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
        void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options

            /* NOTE(review): the column padding inside the help strings below looks
             * collapsed to single spaces in this copy of the file - confirm against
             * the intended --help-options layout.
             *
             * Case sensitivity is switched on only around the single-letter options
             * (-V, -H, -v) and switched back to case-insensitive afterwards.
             */
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "help-options", " Print the options list by itself.", &helpOptions,
                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "or", " Evaluate tests as an OR rather than an AND.", &disjunct,
                std.getopt.config.caseSensitive,
                "v|invert", " Invert the filter, printing lines that do not match.", &invert,
                std.getopt.config.caseInsensitive,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,

                "empty", "<field-list> True if FIELD is empty.", &handlerFldEmpty,
                "not-empty", "<field-list> True if FIELD is not empty.", &handlerFldNotEmpty,
                "blank", "<field-list> True if FIELD is empty or all whitespace.", &handlerFldBlank,
                "not-blank", "<field-list> True if FIELD contains a non-whitespace character.", &handlerFldNotBlank,

                "is-numeric", "<field-list> True if FIELD is interpretable as a number.", &handlerFldIsNumeric,
                "is-finite", "<field-list> True if FIELD is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
                "is-nan", "<field-list> True if FIELD is NaN.", &handlerFldIsNaN,
                "is-infinity", "<field-list> True if FIELD is infinity.", &handlerFldIsInfinity,

                "le", "<field-list>:NUM FIELD <= NUM (numeric).", &handlerNumLE,
                "lt", "<field-list>:NUM FIELD < NUM (numeric).", &handlerNumLT,
                "ge", "<field-list>:NUM FIELD >= NUM (numeric).", &handlerNumGE,
                "gt", "<field-list>:NUM FIELD > NUM (numeric).", &handlerNumGT,
                "eq", "<field-list>:NUM FIELD == NUM (numeric).", &handlerNumEQ,
                "ne", "<field-list>:NUM FIELD != NUM (numeric).", &handlerNumNE,

                "str-le", "<field-list>:STR FIELD <= STR (string).", &handlerStrLE,
                "str-lt", "<field-list>:STR FIELD < STR (string).", &handlerStrLT,
                "str-ge", "<field-list>:STR FIELD >= STR (string).", &handlerStrGE,
                "str-gt", "<field-list>:STR FIELD > STR (string).", &handlerStrGT,
                "str-eq", "<field-list>:STR FIELD == STR (string).", &handlerStrEQ,
                "istr-eq", "<field-list>:STR FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
                "str-ne", "<field-list>:STR FIELD != STR (string).", &handlerStrNE,
                "istr-ne", "<field-list>:STR FIELD != STR (string, case-insensitive).", &handlerIStrNE,
                "str-in-fld", "<field-list>:STR FIELD contains STR (substring search).", &handlerStrInFld,
                "istr-in-fld", "<field-list>:STR FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
                "str-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search).", &handlerStrNotInFld,
                "istr-not-in-fld", "<field-list>:STR FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,

                "regex", "<field-list>:REGEX FIELD matches regular expression.", &handlerRegexMatch,
                "iregex", "<field-list>:REGEX FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
                "not-regex", "<field-list>:REGEX FIELD does not match regular expression.", &handlerRegexNotMatch,
                "not-iregex", "<field-list>:REGEX FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,

                "char-len-le", "<field-list>:NUM character-length(FIELD) <= NUM.", &handlerCharLenLE,
                "char-len-lt", "<field-list>:NUM character-length(FIELD) < NUM.", &handlerCharLenLT,
                "char-len-ge", "<field-list>:NUM character-length(FIELD) >= NUM.", &handlerCharLenGE,
                "char-len-gt", "<field-list>:NUM character-length(FIELD) > NUM.", &handlerCharLenGT,
                "char-len-eq", "<field-list>:NUM character-length(FIELD) == NUM.", &handlerCharLenEQ,
                "char-len-ne", "<field-list>:NUM character-length(FIELD) != NUM.", &handlerCharLenNE,

                "byte-len-le", "<field-list>:NUM byte-length(FIELD) <= NUM.", &handlerByteLenLE,
                "byte-len-lt", "<field-list>:NUM byte-length(FIELD) < NUM.", &handlerByteLenLT,
                "byte-len-ge", "<field-list>:NUM byte-length(FIELD) >= NUM.", &handlerByteLenGE,
                "byte-len-gt", "<field-list>:NUM byte-length(FIELD) > NUM.", &handlerByteLenGT,
                "byte-len-eq", "<field-list>:NUM byte-length(FIELD) == NUM.", &handlerByteLenEQ,
                "byte-len-ne", "<field-list>:NUM byte-length(FIELD) != NUM.", &handlerByteLenNE,

                "ff-le", "FIELD1:FIELD2 FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
                "ff-lt", "FIELD1:FIELD2 FIELD1 < FIELD2 (numeric).", &handlerFFLT,
                "ff-ge", "FIELD1:FIELD2 FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
                "ff-gt", "FIELD1:FIELD2 FIELD1 > FIELD2 (numeric).", &handlerFFGT,
                "ff-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
                "ff-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (numeric).", &handlerFFNE,
                "ff-str-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
                "ff-istr-eq", "FIELD1:FIELD2 FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
                "ff-str-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string).", &handlerFFStrNE,
                "ff-istr-ne", "FIELD1:FIELD2 FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,

                "ff-absdiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
                "ff-absdiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) > NUM", &handlerFFAbsDiffGT,
                "ff-reldiff-le", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
                "ff-reldiff-gt", "FIELD1:FIELD2:NUM abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) > NUM", &handlerFFRelDiffGT,
                );

            /* Both help texts are a bit long. In this case, for "regular" help, don't
             * print options, just the text. The text summarizes the options.
             */
            if (r.helpWanted)
            {
                stdout.write(helpText);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpOptions)
            {
                defaultGetoptPrinter(helpTextOptions, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-filter"));
                return tuple(false, 0);
            }

            /* Input files.
Remaining command line args are files. */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader;
            inputSources = inputSourceRange(filepaths, readHeader);
        }
        catch (Exception e)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, e.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/** Runs the configured tests over every line of every input source and writes the
 * lines that pass to standard output.
 *
 * When a header is in use and the first source's header is non-empty, that header
 * is written once before any data lines. Each line is split on `cmdopt.delim`; only
 * the first `cmdopt.maxFieldIndex + 1` fields are captured and handed to the tests.
 * The tests are combined with `any` when `cmdopt.disjunct` is set and with `all`
 * otherwise, and the combined result is negated when `cmdopt.invert` is set.
 *
 * Params:
 *   cmdopt = Options produced by command line processing, including the input
 *            sources, the delimiter, and the array of test delegates.
 *
 * Throws:
 *   Exception if a line has fewer fields than the tests need, or if anything in the
 *   test/output step throws. The latter is rethrown with the file name and line
 *   number added to the message.
 */
void tsvFilter(ref TsvFilterOptions cmdopt)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, InputSourceRange,
        throwIfWindowsNewlineOnUnix;

    /* The input must be an InputSourceRange holding at least one source. */
    static assert(is(typeof(cmdopt.inputSources) == InputSourceRange));
    assert(!cmdopt.inputSources.empty);

    /* Output is buffered, but should stay responsive when matches are rare. The
     * counter starts above the threshold so the first matching line is flushed
     * immediately; afterwards a match that arrives more than `flushThreshold` input
     * lines after the previous flush forces another one.
     */
    enum flushThreshold = 1024;
    size_t linesSinceFlush = flushThreshold + 1;

    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* The first source's header was already read during argument processing. */
    if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty)
    {
        auto firstSource = cmdopt.inputSources.front;
        throwIfWindowsNewlineOnUnix(firstSource.header, firstSource.name, 1);
        output.appendln(firstSource.header);
    }

    /* Data lines are numbered from 2 when a header line precedes them. */
    immutable size_t firstBodyLineNum = cmdopt.hasHeader ? 2 : 1;

    /* Index of the last field any test needs, as a signed value so it can be
     * compared against the -1 "nothing captured yet" marker below.
     */
    const long lastNeededIndex = cast(long) cmdopt.maxFieldIndex;
    auto fields = new char[][](cmdopt.maxFieldIndex + 1);

    foreach (source; cmdopt.inputSources)
    {
        if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(source.header, source.name, 1);

        foreach (lineNum, line; source.file.bufferedByLine.enumerate(firstBodyLineNum))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* Capture fields until the last one needed has been stored. */
            int lastCaptured = -1;
            foreach (value; line.splitter(cmdopt.delim))
            {
                if (lastCaptured == lastNeededIndex) break;
                fields[++lastCaptured] = value;
            }

            if (lastCaptured < 0)
            {
                /* Bug work-around. Currently empty lines are not handled properly by splitter.
                 *  Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *  Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Point to the line. It's an empty string.
                 */
                assert(line.length == 0);
                lastCaptured = 0;
                fields[0] = line;
            }

            enforce(lastCaptured >= lastNeededIndex,
                    format("Not enough fields in line. File: %s, Line: %s",
                           source.name, lineNum));

            /* Evaluate the tests. A test throws when a field cannot be converted to
             * the type it expects.
             */
            try
            {
                ++linesSinceFlush;

                bool matched;
                if (cmdopt.disjunct) matched = cmdopt.tests.any!(test => test(fields));
                else matched = cmdopt.tests.all!(test => test(fields));

                /* Inverting the filter flips the outcome. */
                const bool keep = (matched != cmdopt.invert);

                if (keep)
                {
                    if (output.appendln(line))
                    {
                        /* appendln reported that it flushed. */
                        linesSinceFlush = 0;
                    }
                    else if (linesSinceFlush > flushThreshold)
                    {
                        output.flush;
                        linesSinceFlush = 0;
                    }
                }
            }
            catch (Exception e)
            {
                /* A failure on line 1 is often an unskipped header, so add a hint. */
                const string headerHint =
                    (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "";

                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           e.msg, source.name, lineNum, headerHint));
            }
        }
    }
}