1 /**
2 Command line tool that filters TSV files.
3 
4 This tool filters tab-delimited files based on numeric or string comparisons
5 against specific fields. See the helpText string for details.
6 
7 Copyright (c) 2015-2019, eBay Software Foundation
8 Initially written by Jon Degenhardt
9 
10 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
11 */
12 module tsv_utils.tsv_filter;
13 
import std.algorithm : canFind, equal, findSplit, max, min;
import std.conv : to;
import std.format : format;
import std.math : abs, isFinite, isInfinity, isNaN;
import std.regex;
import std.stdio;
import std.string : isNumeric;
import std.typecons : tuple;
import std.uni: asLowerCase, toLower;
23 
24 /* The program has two main parts, command line arg processing and processing the input
25  * files. Much of the work is in command line arg processing. This sets up the tests run
26  * against each input line. The tests are an array of delegates (closures) run against the
27  * fields in the line. The tests are based on command line arguments, of which there is
28  * a lengthy set, one for each test.
29  */
30 
/* Embedded D runtime options array, declared only when compiling with a front end
 * whose __VERSION__ is 2085 or later. The single entry is "gcopt=cleanup:none".
 * NOTE(review): presumably this tells the garbage collector to skip its cleanup pass
 * at program termination so the tool exits faster - confirm against the druntime
 * garbage collector configuration documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
32 
/** Program entry point. Runs command line processing through
 * TsvFilterOptions.processArgs and, when that indicates execution should continue,
 * calls tsvFilter with the options and cmdArgs[1..$].
 *
 * Returns: The exit code supplied by processArgs when it signals a stop; 1 if
 * tsvFilter throws an Exception (the message is written to stderr); 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;
    const argsResult = cmdopt.processArgs(cmdArgs);

    /* Element 0 says whether to continue; element 1 is the exit code to use if not. */
    if (!argsResult[0])
    {
        return argsResult[1];
    }

    /* Profiling builds reset the LDC profile counters before the filtering work starts. */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    int exitCode = 0;
    try
    {
        tsvFilter(cmdopt, cmdArgs[1..$]);
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitCode = 1;
    }
    return exitCode;
}
61 
/* Short-form help text: synopsis, global options, and one syntax/example entry per
 * operator family.
 * NOTE(review): presumably printed for the basic help option; the code that prints it
 * is outside this section of the file.
 *
 * The final example uses '--ff-absdiff-le', one of the two absolute-difference
 * operators listed in the syntax line above it, and describes the fields actually
 * named by '1:3'.
 */
immutable helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne  FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field1 - field3) <= 0.25

EOS";
114 
/* Long-form help text: synopsis, a description of the test syntax with a worked
 * example, the list of available test categories, and behavioral details. The text
 * ends with an "Options:" heading.
 * NOTE(review): the option list is presumably appended by the code that prints this
 * text - that code is outside this section of the file.
 */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";
152 
/* Header text for the options-only help: the synopsis line followed by an "Options:"
 * heading.
 * NOTE(review): the option list itself is presumably appended by the code that prints
 * this text - that code is outside this section of the file.
 */
immutable helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";
158 
/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing uses these to build the test structure.
 */
163 
/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line, and returns the boolean test result.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *   (e.g. --ff-absdiff-le 1:3:0.25)
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
 *
 * All field indexes passed to these functions are used directly as indexes into the
 * 'fields' array.
 */
alias FieldUnaryPredicate    = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate  = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate  = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);
199 
/* Delegate factories - Each function below binds a predicate function and its fixed
 * arguments into a FieldsPredicate closure. The returned delegate takes only the split
 * input line and forwards it, together with the bound arguments, to 'fn'.
 */

// Binds a single-field predicate to its field index.
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields) { return fn(fields, index); };
}

// Binds a field-vs-number predicate to its field index and numeric value.
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

// Binds a field-vs-string predicate to its field index and string value.
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

// Binds a case-insensitive field-vs-string predicate to its field index and dstring value.
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

// Binds a field-vs-regex predicate to its field index and compiled regex.
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index, value); };
}

// Binds a field-vs-field predicate to its two field indexes.
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2); };
}

// Binds a two-field-plus-number predicate to its two field indexes and numeric value.
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields) { return fn(fields, index1, index2, value); };
}
234 
235 /* Predicate functions - These are the actual functions used in a FieldsPredicate. They
236  * are a direct reflection of the operators available via command line args. Each matches
237  * one of the FieldsPredicate function aliases defined above.
238  */
// True if the field at 'index' contains no characters.
bool fldEmpty(const char[][] fields, size_t index)
{
    const field = fields[index];
    return field.length == 0;
}

// True if the field at 'index' contains at least one character.
bool fldNotEmpty(const char[][] fields, size_t index)
{
    const field = fields[index];
    return field.length != 0;
}

// True if the field at 'index' matches the pattern `^\s*$` (empty or whitespace only).
bool fldBlank(const char[][] fields, size_t index)
{
    auto blankPattern = ctRegex!`^\s*$`;
    const bool isBlank = cast(bool) fields[index].matchFirst(blankPattern);
    return isBlank;
}

// True if the field at 'index' does not match the pattern `^\s*$`.
bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankPattern = ctRegex!`^\s*$`;
    const bool isBlank = cast(bool) fields[index].matchFirst(blankPattern);
    return !isBlank;
}
243 
// True if std.string.isNumeric accepts the field at 'index'.
bool fldIsNumeric(const char[][] fields, size_t index)
{
    const text = fields[index];
    return text.isNumeric;
}

// True if the field passes isNumeric and its double value is finite.
bool fldIsFinite(const char[][] fields, size_t index)
{
    const text = fields[index];
    if (!text.isNumeric) return false;
    return text.to!double.isFinite;
}

// True if the field passes isNumeric and its double value is NaN.
bool fldIsNaN(const char[][] fields, size_t index)
{
    const text = fields[index];
    if (!text.isNumeric) return false;
    return text.to!double.isNaN;
}

// True if the field passes isNumeric and its double value is infinite.
bool fldIsInfinity(const char[][] fields, size_t index)
{
    const text = fields[index];
    if (!text.isNumeric) return false;
    return text.to!double.isInfinity;
}
248 
/* Field vs number comparisons - Each converts the field at 'index' to a double (the
 * conversion throws if the text is not convertible) and compares it against 'val'.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    immutable double fieldValue = fields[index].to!double;
    return fieldValue != val;
}
255 
/* Field vs string comparisons - Each compares the field at 'index' against 'val' using
 * the built-in array comparison operators.
 */
bool strLE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const field = fields[index];
    return field != val;
}

// True if 'val' occurs somewhere in the field at 'index'.
bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

// True if 'val' does not occur anywhere in the field at 'index'.
bool strNotInFld(const char[][] fields, size_t index, string val)
{
    const bool found = canFind(fields[index], val);
    return !found;
}
264 
/* Note: For istr predicates, the command line value has been lower-cased by
 * fieldVsIStringOptionHandler.
 */
// True if the lower-cased field at 'index' equals 'val' element for element.
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    auto lowered = asLowerCase(fields[index]);
    return equal(lowered, val);
}

// True if the lower-cased field at 'index' differs from 'val'.
bool istrNE(const char[][] fields, size_t index, dstring val)
{
    auto lowered = asLowerCase(fields[index]);
    return !equal(lowered, val);
}

// True if 'val' occurs somewhere in the lower-cased field at 'index'.
bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    auto lowered = asLowerCase(fields[index]);
    return canFind(lowered, val);
}

// True if 'val' does not occur anywhere in the lower-cased field at 'index'.
bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    auto lowered = asLowerCase(fields[index]);
    return !canFind(lowered, val);
}
271 
272 /* Note: Case-sensitivity is built into the regex value, so these regex predicates are
273  * used for both case-sensitive and case-insensitive regex operators.
274  */
// True if the regex 'val' finds a match in the field at 'index'.
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    const bool matched = cast(bool) matchFirst(fields[index], val);
    return matched;
}

// True if the regex 'val' finds no match in the field at 'index'.
bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    const bool matched = cast(bool) matchFirst(fields[index], val);
    return !matched;
}
277 
/* Field vs field numeric comparisons - Both fields are converted to double (the
 * conversion throws if the text is not convertible) and then compared.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs <= rhs;
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs < rhs;
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs >= rhs;
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs > rhs;
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs == rhs;
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return lhs != rhs;
}

/* Field vs field string comparisons - Exact comparisons use the array equality
 * operators; case-insensitive ones compare the lower-cased forms of both fields.
 */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs == rhs;
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const lhs = fields[index1];
    const rhs = fields[index2];
    return lhs != rhs;
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    auto lhsLower = fields[index1].asLowerCase;
    auto rhsLower = fields[index2].asLowerCase;
    return equal(lhsLower, rhsLower);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    auto lhsLower = fields[index1].asLowerCase;
    auto rhsLower = fields[index2].asLowerCase;
    return !equal(lhsLower, rhsLower);
}
294 
// Absolute difference: abs(v1 - v2).
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

// Relative difference: abs(v1 - v2) divided by the smaller of abs(v1) and abs(v2).
auto RelDiff(double v1, double v2)
{
    return abs(v1 - v2) / min(abs(v1), abs(v2));
}

/* Field vs field difference tests - Both fields are converted to double (the conversion
 * throws if the text is not convertible), the absolute or relative difference is
 * computed, and the result is compared against 'value'.
 */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return AbsDiff(lhs, rhs) > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double lhs = fields[index1].to!double;
    immutable double rhs = fields[index2].to!double;
    return RelDiff(lhs, rhs) > value;
}
314 
315 /* Command line option handlers - There is a command line option handler for each
316  * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
317  * etc. Option handlers are passed the tests array, the predicate function, and the
318  * command line option arguments. A FieldsPredicate delegate is created and appended to
319  * the tests array. An exception is thrown if errors are detected while processing the
320  * option, the error text is intended for the end user.
321  *
 * These option handlers have similar functionality, differing in option processing and
 * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for values less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
 * the string "3:1000" representing the option value. It parses the option value into
 * field index (size_t) and value (double). These are wrapped in a FieldsPredicate
 * which is added to the tests array. An error is signaled if the option string is invalid.
329  *
330  * During processing, fields indexes are converted from one-based to zero-based. As an
331  * optimization, the maximum field index is also tracked. This allows early termination of
332  * line splitting.
333  */
334 
/* Option handler for single-field tests. 'optionVal' is expected to be a 1-based field
 * number. On success a delegate wrapping 'fn' and the zero-based index is appended to
 * 'tests', and 'maxFieldIndex' is raised if the new index is larger.
 *
 * Throws: Exception with a user-oriented message if 'optionVal' is not convertible to
 * size_t or is zero.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    size_t oneBasedField;
    try
    {
        oneBasedField = optionVal.to!size_t;
    }
    catch (Exception)
    {
        immutable msg = format(
            "Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
    if (fieldIndex > maxFieldIndex) maxFieldIndex = fieldIndex;
}
357 
/* Option handler for field vs number tests such as '--lt 3:1000'. 'optionVal' is split
 * at the first ':' into a 1-based field number and a numeric value. On success a
 * delegate wrapping 'fn', the zero-based index, and the value is appended to 'tests',
 * and 'maxFieldIndex' is raised if the new index is larger.
 *
 * All error messages show the option with its '--' prefix, matching the other option
 * handlers in this file.
 *
 * Throws: Exception with a user-oriented message if the ':' separator or the value is
 * missing, if either part is not convertible to its numeric type, or if the field
 * number is zero.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    immutable valSplit = findSplit(optionVal, ":");

    /* valSplit[1] is empty when no ':' was found; valSplit[2] is the text after it. */
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    size_t field;
    double value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!double;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    /* Field numbers are 1-based on the command line, zero-based internally. */
    immutable size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex;
}
391 
/* Option handler for field vs string tests such as '--str-eq 2:abc'. 'optionVal' is
 * split at the first ':' into a 1-based field number and a string value. On success a
 * delegate wrapping 'fn', the zero-based index, and the string is appended to 'tests',
 * and 'maxFieldIndex' is raised if the new index is larger.
 *
 * Throws: Exception with a user-oriented message if the ':' separator or the value is
 * missing, if the field number is not convertible to size_t, or if it is zero.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    immutable parts = findSplit(optionVal, ":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasValue = parts[2].length != 0;
    if (!(hasSeparator && hasValue))
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    string value;
    try
    {
        oneBasedField = parts[0].to!size_t;
        value = parts[2].to!string;
    }
    catch (Exception)
    {
        immutable msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsStringDelegate(fn, fieldIndex, value);
    if (fieldIndex > maxFieldIndex) maxFieldIndex = fieldIndex;
}
425 
426 /* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
427  * case-insensitive comparison will be done on lower-cased values.
428  */
/* Option handler for case-insensitive field vs string tests. 'optionVal' is split at
 * the first ':' into a 1-based field number and a string value. The string is converted
 * to a dstring and lower-cased before being bound into the delegate appended to
 * 'tests'. 'maxFieldIndex' is raised if the new zero-based index is larger.
 *
 * Throws: Exception with a user-oriented message if the ':' separator or the value is
 * missing, if the field number is not convertible to size_t, or if it is zero.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    immutable parts = findSplit(optionVal, ":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasValue = parts[2].length != 0;
    if (!(hasSeparator && hasValue))
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    string value;
    try
    {
        oneBasedField = parts[0].to!size_t;
        value = parts[2].to!string;
    }
    catch (Exception)
    {
        immutable msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    /* The predicates compare against lower-cased field text, so lower-case the value once here. */
    immutable dstring loweredValue = value.to!dstring.toLower;
    tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredValue);
    if (fieldIndex > maxFieldIndex) maxFieldIndex = fieldIndex;
}
462 
/* Option handler for field vs regex tests such as "--regex '2:ab*c'". 'optionVal' is
 * split at the first ':' into a 1-based field number and a regular expression. The
 * regex is compiled with the "i" modifier when 'caseSensitive' is false, otherwise with
 * no modifiers. On success a delegate wrapping 'fn', the zero-based index, and the
 * compiled regex is appended to 'tests', and 'maxFieldIndex' is raised if the new index
 * is larger.
 *
 * Throws: Exception with a user-oriented message if the ':' separator or the pattern is
 * missing, if field conversion or regex compilation throws, or if the field number is
 * zero.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    immutable parts = findSplit(optionVal, ":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasPattern = parts[2].length != 0;
    if (!(hasSeparator && hasPattern))
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    Regex!char pattern;
    try
    {
        oneBasedField = parts[0].to!size_t;
        pattern = regex(parts[2], caseSensitive ? "" : "i");
    }
    catch (Exception)
    {
        immutable msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, pattern);
    if (fieldIndex > maxFieldIndex) maxFieldIndex = fieldIndex;
}
498 
/* Option handler for field vs field tests such as '--ff-le 2:4'. 'optionVal' is split
 * at the first ':' into two 1-based field numbers. On success a delegate wrapping 'fn'
 * and the two zero-based indexes is appended to 'tests', and 'maxFieldIndex' is raised
 * to cover both indexes.
 *
 * Throws: Exception with a user-oriented message if the ':' separator or second field
 * is missing, if either part is not convertible to size_t, if either field number is
 * zero, or if both field numbers are the same.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    immutable parts = findSplit(optionVal, ":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasSecondField = parts[2].length != 0;
    if (!(hasSeparator && hasSecondField))
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField1;
    size_t oneBasedField2;
    try
    {
        oneBasedField1 = parts[0].to!size_t;
        oneBasedField2 = parts[2].to!size_t;
    }
    catch (Exception)
    {
        immutable msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField1 == 0 || oneBasedField2 == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    if (oneBasedField1 == oneBasedField2)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex1 = oneBasedField1 - 1;
    immutable size_t fieldIndex2 = oneBasedField2 - 1;
    tests ~= makeFieldVsFieldDelegate(fn, fieldIndex1, fieldIndex2);
    if (fieldIndex1 > maxFieldIndex) maxFieldIndex = fieldIndex1;
    if (fieldIndex2 > maxFieldIndex) maxFieldIndex = fieldIndex2;
}
540 
541 
/* Option handler for two-field-plus-number tests such as '--ff-absdiff-le 1:3:0.25'.
 * 'optionVal' is split at the first ':' and the remainder is split again at its first
 * ':', giving two 1-based field numbers and a numeric value. On success a delegate
 * wrapping 'fn', the two zero-based indexes, and the value is appended to 'tests', and
 * 'maxFieldIndex' is raised to cover both indexes.
 *
 * Throws: Exception with a user-oriented message if either separator or any part is
 * missing, if a part is not convertible to its numeric type, if either field number is
 * zero, or if both field numbers are the same.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t oneBasedField1;
    size_t oneBasedField2;
    double value;
    bool parsedOk = false;

    immutable outerParts = findSplit(optionVal, ":");
    if (outerParts[1].length != 0 && outerParts[2].length != 0)
    {
        immutable innerParts = findSplit(outerParts[2], ":");
        if (innerParts[1].length != 0 && innerParts[2].length != 0)
        {
            try
            {
                oneBasedField1 = outerParts[0].to!size_t;
                oneBasedField2 = innerParts[0].to!size_t;
                value = innerParts[2].to!double;
                parsedOk = true;
            }
            catch (Exception)
            {
                /* Conversion failure: parsedOk stays false and is reported below. */
            }
        }
    }

    if (!parsedOk)
    {
        immutable msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField1 == 0 || oneBasedField2 == 0)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    if (oneBasedField1 == oneBasedField2)
    {
        immutable msg = format(
            "Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal);
        throw new Exception(msg);
    }

    immutable size_t fieldIndex1 = oneBasedField1 - 1;
    immutable size_t fieldIndex2 = oneBasedField2 - 1;
    tests ~= makeFieldFieldNumDelegate(fn, fieldIndex1, fieldIndex2, value);
    if (fieldIndex1 > maxFieldIndex) maxFieldIndex = fieldIndex1;
    if (fieldIndex2 > maxFieldIndex) maxFieldIndex = fieldIndex2;
}
593 
594 /** Command line options - This struct holds the results of command line option processing.
595  * It also has a method, processArgs, that invokes command line arg processing.
596  */
597 struct TsvFilterOptions
598 {
599     string programName;
600     FieldsPredicate[] tests;         // Derived from tests
601     size_t maxFieldIndex;            // Derived from tests
602     bool hasHeader = false;          // --H|header
603     bool invert = false;             // --invert
604     bool disjunct = false;           // --or
605     char delim = '\t';               // --delimiter
606     bool helpVerbose = false;        // --help-verbose
607     bool helpOptions = false;        // --help-options
608     bool versionWanted = false;      // --V|version
609 
610     /* Returns a tuple. First value is true if command line arguments were successfully
611      * processed and execution should continue, or false if an error occurred or the user
612      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
613      *
614      * Returning true (execution continues) means args have been validated and the
615      * tests array has been established.
616      */
617     auto processArgs (ref string[] cmdArgs)
618     {
619         import std.getopt;
620         import std.path : baseName, stripExtension;
621         import tsv_utils.common.getopt_inorder;
622 
623         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
624 
625         /* Command option handlers - One handler for each option. These conform to the
626          * getopt required handler signature, and separate knowledge the specific command
627          * option text from the option processing.
628          */
629         void handlerFldEmpty(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty,    option, value); }
630         void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); }
631         void handlerFldBlank(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank,    option, value); }
632         void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); }
633 
634         void handlerFldIsNumeric(string option, string value)  { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); }
635         void handlerFldIsFinite(string option, string value)   { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); }
636         void handlerFldIsNaN(string option, string value)      { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); }
637         void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); }
638 
639         void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); }
640         void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); }
641         void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); }
642         void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); }
643         void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); }
644         void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); }
645 
646         void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); }
647         void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
648         void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
649         void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
650         void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
651         void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }
652 
653         void handlerStrInFld(string option, string value)    { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld,    option, value); }
654         void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }
655 
656         void handlerIStrEQ(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ,       option, value); }
657         void handlerIStrNE(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE,       option, value); }
658         void handlerIStrInFld(string option, string value)    { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld,    option, value); }
659         void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }
660 
661         void handlerRegexMatch(string option, string value)     { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, true); }
662         void handlerRegexNotMatch(string option, string value)  { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
663         void handlerIRegexMatch(string option, string value)    { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, false); }
664         void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }
665 
666         void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
667         void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
668         void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
669         void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
670         void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
671         void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }
672 
673         void handlerFFStrEQ(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ,  option, value); }
674         void handlerFFStrNE(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE,  option, value); }
675         void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
676         void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }
677 
678         void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
679         void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
680         void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
681         void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }
682 
683         try
684         {
685             arraySep = ",";    // Use comma to separate values in command line options
686             auto r = getoptInorder(
687                 cmdArgs,
688                 "help-verbose",    "     Print full help.", &helpVerbose,
689                 "help-options",    "     Print the options list by itself.", &helpOptions,
690                  std.getopt.config.caseSensitive,
691                 "V|version",       "     Print version information and exit.", &versionWanted,
692                 "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
693                 std.getopt.config.caseInsensitive,
694                 "or",              "     Evaluate tests as an OR rather than an AND.", &disjunct,
695                 std.getopt.config.caseSensitive,
696                 "v|invert",        "     Invert the filter, printing lines that do not match.", &invert,
697                 std.getopt.config.caseInsensitive,
698                 "d|delimiter",     "CHR  Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
699 
700                 "empty",           "FIELD       True if field is empty.", &handlerFldEmpty,
701                 "not-empty",       "FIELD       True if field is not empty.", &handlerFldNotEmpty,
702                 "blank",           "FIELD       True if field is empty or all whitespace.", &handlerFldBlank,
703                 "not-blank",       "FIELD       True if field contains a non-whitespace character.", &handlerFldNotBlank,
704 
705                 "is-numeric",      "FIELD       True if field is interpretable as a number.", &handlerFldIsNumeric,
706                 "is-finite",       "FIELD       True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
707                 "is-nan",          "FIELD       True if field is NaN.", &handlerFldIsNaN,
708                 "is-infinity",     "FIELD       True if field is infinity.", &handlerFldIsInfinity,
709 
710                 "le",              "FIELD:NUM   FIELD <= NUM (numeric).", &handlerNumLE,
711                 "lt",              "FIELD:NUM   FIELD <  NUM (numeric).", &handlerNumLT,
712                 "ge",              "FIELD:NUM   FIELD >= NUM (numeric).", &handlerNumGE,
713                 "gt",              "FIELD:NUM   FIELD >  NUM (numeric).", &handlerNumGT,
714                 "eq",              "FIELD:NUM   FIELD == NUM (numeric).", &handlerNumEQ,
715                 "ne",              "FIELD:NUM   FIELD != NUM (numeric).", &handlerNumNE,
716 
717                 "str-le",          "FIELD:STR   FIELD <= STR (string).", &handlerStrLE,
718                 "str-lt",          "FIELD:STR   FIELD <  STR (string).", &handlerStrLT,
719                 "str-ge",          "FIELD:STR   FIELD >= STR (string).", &handlerStrGE,
720                 "str-gt",          "FIELD:STR   FIELD >  STR (string).", &handlerStrGT,
721                 "str-eq",          "FIELD:STR   FIELD == STR (string).", &handlerStrEQ,
722                 "istr-eq",         "FIELD:STR   FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
723                 "str-ne",          "FIELD:STR   FIELD != STR (string).", &handlerStrNE,
724                 "istr-ne",         "FIELD:STR   FIELD != STR (string, case-insensitive).", &handlerIStrNE,
725                 "str-in-fld",      "FIELD:STR   FIELD contains STR (substring search).", &handlerStrInFld,
726                 "istr-in-fld",     "FIELD:STR   FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
727                 "str-not-in-fld",  "FIELD:STR   FIELD does not contain STR (substring search).", &handlerStrNotInFld,
728                 "istr-not-in-fld", "FIELD:STR   FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,
729 
730                 "regex",           "FIELD:REGEX   FIELD matches regular expression.", &handlerRegexMatch,
731                 "iregex",          "FIELD:REGEX   FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
732                 "not-regex",       "FIELD:REGEX   FIELD does not match regular expression.", &handlerRegexNotMatch,
733                 "not-iregex",      "FIELD:REGEX   FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,
734 
735                 "ff-le",           "FIELD1:FIELD2   FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
736                 "ff-lt",           "FIELD1:FIELD2   FIELD1 <  FIELD2 (numeric).", &handlerFFLT,
737                 "ff-ge",           "FIELD1:FIELD2   FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
738                 "ff-gt",           "FIELD1:FIELD2   FIELD1 >  FIELD2 (numeric).", &handlerFFGT,
739                 "ff-eq",           "FIELD1:FIELD2   FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
740                 "ff-ne",           "FIELD1:FIELD2   FIELD1 != FIELD2 (numeric).", &handlerFFNE,
741                 "ff-str-eq",       "FIELD1:FIELD2   FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
742                 "ff-istr-eq",      "FIELD1:FIELD2   FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
743                 "ff-str-ne",       "FIELD1:FIELD2   FIELD1 != FIELD2 (string).", &handlerFFStrNE,
744                 "ff-istr-ne",      "FIELD1:FIELD2   FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,
745 
746                 "ff-absdiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
747                 "ff-absdiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2)  > NUM", &handlerFFAbsDiffGT,
748                 "ff-reldiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
749                 "ff-reldiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2))  > NUM", &handlerFFRelDiffGT,
750                 );
751 
752             /* Both help texts are a bit long. In this case, for "regular" help, don't
753              * print options, just the text. The text summarizes the options.
754              */
755             if (r.helpWanted)
756             {
757                 stdout.write(helpText);
758                 return tuple(false, 0);
759             }
760             else if (helpVerbose)
761             {
762                 defaultGetoptPrinter(helpTextVerbose, r.options);
763                 return tuple(false, 0);
764             }
765             else if (helpOptions)
766             {
767                 defaultGetoptPrinter(helpTextOptions, r.options);
768                 return tuple(false, 0);
769             }
770             else if (versionWanted)
771             {
772                 import tsv_utils.common.tsvutils_version;
773                 writeln(tsvutilsVersionNotice("tsv-filter"));
774                 return tuple(false, 0);
775             }
776         }
777         catch (Exception exc)
778         {
779             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
780             return tuple(false, 1);
781         }
782         return tuple(true, 0);
783     }
784 }
785 
/** tsvFilter reads the input files line by line, runs the filter tests against each
 * data line, and writes the lines that pass to standard output.
 *
 * Params:
 *   cmdopt     = Parsed command line options. Supplies the tests, the field delimiter,
 *                the highest field index read by any test, and the header, OR, and
 *                invert settings.
 *   inputFiles = Files to read. Standard input is read if the list is empty, and for
 *                any entry named "-".
 *
 * When cmdopt.hasHeader is set, the first line of every file is treated as a header.
 * It is written once, for the first file that has one, and is never run through the
 * tests.
 *
 * Throws: Exception if a data line has fewer fields than the tests need, or if a test
 * throws while evaluating a line. In the latter case the message identifies the file
 * and line, and the original exception is attached as the next-in-chain. Exceptions
 * from reading input or writing output propagate unchanged.
 */
void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsv_utils.common.utils : BufferedOutputRange, bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* BufferedOutputRange improves performance on narrow files with high percentages of
     * writes. Want responsive output if output is rare, so ensure the first matched
     * line is written, and that writes separated by long stretches of non-matched lines
     * are written.
     *
     * The counter starts above the threshold so that the first matching line triggers
     * an explicit flush.
     */
    enum maxInputLinesWithoutBufferFlush = 1024;
    size_t inputLinesWithoutBufferFlush = maxInputLinesWithoutBufferFlush + 1;

    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Loop invariant. Signed so it can be compared against fieldIndex, which uses -1
     * to mean "no fields seen yet".
     */
    const long maxFieldIndex = cast(long) cmdopt.maxFieldIndex;

    /* Process each input file, one line at a time. Only the fields up to the highest
     * index used by a test are kept; the array is reused for every line.
     */
    auto lineFields = new char[][](cmdopt.maxFieldIndex + 1);
    bool headerWritten = false;
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);
            if (lineNum == 1 && cmdopt.hasHeader)
            {
                /* Header. Output on the first file, skip subsequent files. */
                if (!headerWritten)
                {
                    bufferedOutput.appendln(line);
                    headerWritten = true;
                }
            }
            else
            {
                /* Copy the needed number of fields to the fields array. Splitting stops
                 * as soon as the last field needed by any test has been captured.
                 */
                int fieldIndex = -1;
                foreach (fieldValue; line.splitter(cmdopt.delim))
                {
                    if (fieldIndex == maxFieldIndex) break;
                    fieldIndex++;
                    lineFields[fieldIndex] = fieldValue;
                }

                if (fieldIndex == -1)
                {
                    assert(line.length == 0);
                    /* Bug work-around. Currently empty lines are not handled properly by splitter.
                     *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * Work-around: Point to the line. It's an empty string.
                     */
                    fieldIndex++;
                    lineFields[fieldIndex] = line;
                }

                if (fieldIndex < maxFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, lineNum));
                }

                inputLinesWithoutBufferFlush++;

                /* Run the tests. Tests will fail (throw) if a field cannot be converted
                 * to the expected type. Only the test evaluation is guarded: a failure
                 * while writing output is not a line or field problem and must not be
                 * reported as one.
                 */
                bool passed;
                try
                {
                    passed = cmdopt.disjunct ?
                        cmdopt.tests.any!(x => x(lineFields)) :
                        cmdopt.tests.all!(x => x(lineFields));
                }
                catch (Exception exc)
                {
                    /* Add file and line context; keep the original exception chained. */
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                               (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""),
                        exc);
                }

                if (cmdopt.invert) passed = !passed;
                if (passed)
                {
                    /* appendln reports whether it flushed on its own. If it did not and
                     * too many input lines have gone by since the last flush, force one
                     * so that sparse output still appears promptly.
                     */
                    const bool wasFlushed = bufferedOutput.appendln(line);
                    if (wasFlushed) inputLinesWithoutBufferFlush = 0;
                    else if (inputLinesWithoutBufferFlush > maxInputLinesWithoutBufferFlush)
                    {
                        bufferedOutput.flush;
                        inputLinesWithoutBufferFlush = 0;
                    }
                }
            }
        }
    }
}