1 /**
2 Command line tool that filters TSV files.
3 
4 This tool filters tab-delimited files based on numeric or string comparisons
5 against specific fields. See the helpText string for details.
6 
7 Copyright (c) 2015-2018, eBay Software Foundation
8 Initially written by Jon Degenhardt
9 
10 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
11 */
12 module tsv_filter;
13 
14 import std.algorithm : canFind, equal, findSplit, max, min;
15 import std.conv : to;
16 import std.format : format;
17 import std.math : abs, isFinite, isInfinity, isNaN;
18 import std.regex;
19 import std.stdio;
20 import std.string : isNumeric;
21 import std.typecons : tuple;
22 import std.uni: asLowerCase, toLower;
23 
24 /* The program has two main parts, command line arg processing and processing the input
25  * files. Much of the work is in command line arg processing. This sets up the tests run
26  * against each input line. The tests are an array of delegates (closures) run against the
27  * fields in the line. The tests are based on command line arguments, of which there is
28  * a lengthy set, one for each test.
29  */
30 
/** Program entry point.
 *
 * Hands the command line to TsvFilterOptions.processArgs, then runs tsvFilter with the
 * resulting options and the arguments following the program name. Returns the exit code
 * supplied by processArgs when it reports that execution should stop, 1 if tsvFilter
 * throws an Exception (the message is written to stderr), and 0 otherwise.
 */
int main(string[] cmdArgs)
{
    /* When running in DMD code coverage mode, turn on report merging. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions options;
    auto argsResult = options.processArgs(cmdArgs);
    if (!argsResult[0])
    {
        /* Either an error occurred or help/version was requested. */
        return argsResult[1];
    }

    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    int exitCode = 0;
    try
    {
        tsvFilter(options, cmdArgs[1..$]);
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", options.programName, exc.msg);
        exitCode = 1;
    }
    return exitCode;
}
56 
/* Brief help text: synopsis, global options, and a summary of each operator family
 * with its syntax and one example. The ff-absdiff example uses '--ff-absdiff-le' so
 * that it matches the operators listed on the Syntax line directly above it.
 */
auto helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne  FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field1 - field2) <= 0.25

EOS";
109 
/* Verbose help text: synopsis, an overview of the test syntax with a worked example,
 * the list of available test kinds, and behavioral details. The text ends with an
 * 'Options:' heading; presumably the option list is appended by the code that prints
 * it - confirm in processArgs.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";
147 
/* Header text for the options-only help: just the synopsis and an 'Options:' heading.
 * Presumably the option list is appended by the code that prints it - confirm in
 * processArgs.
 */
auto helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";
153 
154 /**
155 The next blocks of code define the structure of the boolean tests run against input lines.
156 This includes function and delegate (closure) signatures, creation mechanisms, option
handlers, etc. Command line arg processing uses these to build the test structure.
158 */
159 
160 /* FieldsPredicate delegate signature - Each input line is run against a set of boolean
161  * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
162  * containing all info about the test except the field values of the line being tested.
163  * These delegates are created as part of command line arg processing. The wrapped data
164  * includes operation, field indexes, literal values, etc. At run-time the delegate is
165  * passed one argument, the split input line.
166  */
alias FieldsPredicate = bool delegate(const char[][] fields);

/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value.
 *   (e.g. --ff-absdiff-le 1:3:0.25)
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
 */
alias FieldUnaryPredicate    = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate  = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate  = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);
195 
/* Binds a single-field predicate to a zero-based field index, yielding a delegate
 * that only needs the split input line.
 */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index);
    };
}
200 
/* Binds a field-vs-number predicate to its field index and numeric comparison value. */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
205 
/* Binds a field-vs-string predicate to its field index and string comparison value. */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
210 
/* Binds a case-insensitive field-vs-string predicate to its field index and the
 * dstring comparison value.
 */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
215 
/* Binds a field-vs-regex predicate to its field index and compiled regex. */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
220 
/* Binds a field-vs-field predicate to the two field indexes it compares. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2);
    };
}
225 
/* Binds a two-field-plus-number predicate to its two field indexes and numeric value. */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2, value);
    };
}
230 
231 /* Predicate functions - These are the actual functions used in a FieldsPredicate. They
232  * are a direct reflection of the operators available via command line args. Each matches
233  * one of the FieldsPredicate function aliases defined above.
234  */
/* Empty/blank tests. A field is empty when it has no characters; it is blank when the
 * regex `^\s*$` matches it (empty or whitespace only).
 */
bool fldEmpty(const char[][] fields, size_t index)
{
    return fields[index].length == 0;
}

bool fldNotEmpty(const char[][] fields, size_t index)
{
    return fields[index].length != 0;
}

bool fldBlank(const char[][] fields, size_t index)
{
    return !matchFirst(fields[index], ctRegex!`^\s*$`).empty;
}

bool fldNotBlank(const char[][] fields, size_t index)
{
    return matchFirst(fields[index], ctRegex!`^\s*$`).empty;
}
239 
/* Numeric classification tests. Each of the finite/NaN/infinity tests first requires
 * std.string.isNumeric to accept the field, and only then converts it to double.
 */
bool fldIsNumeric(const char[][] fields, size_t index)
{
    return isNumeric(fields[index]);
}

bool fldIsFinite(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isFinite(to!double(fields[index]));
}

bool fldIsNaN(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isNaN(to!double(fields[index]));
}

bool fldIsInfinity(const char[][] fields, size_t index)
{
    if (!isNumeric(fields[index])) return false;
    return isInfinity(to!double(fields[index]));
}
244 
/* Field vs. number comparisons. The field text is converted to double on every call;
 * the conversion throws if the text is not a valid number.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    return to!double(fields[index]) != val;
}
251 
/* Field vs. string tests: relational and equality comparisons using the built-in
 * array comparison operators, plus substring containment via canFind.
 */
bool strLE(const char[][] fields, size_t index, string val)
{
    return fields[index] <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    return fields[index] < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    return fields[index] >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    return fields[index] > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    return fields[index] == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    return fields[index] != val;
}

bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    return !canFind(fields[index], val);
}
260 
/* Case-insensitive string tests. The field is lower-cased lazily on each call and
 * compared with 'val' as given. Note: the command line value is expected to have been
 * lower-cased already by fieldVsIStringOptionHandler.
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(asLowerCase(fields[index]), val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    return !equal(asLowerCase(fields[index]), val);
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(asLowerCase(fields[index]), val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    return !canFind(asLowerCase(fields[index]), val);
}
267 
/* Regex tests. Note: case-sensitivity is carried by the regex value itself, so the same
 * two predicates serve both the case-sensitive and case-insensitive regex operators.
 */
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    return !matchFirst(fields[index], val).empty;
}

bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    return matchFirst(fields[index], val).empty;
}
273 
/* Field vs. field numeric comparisons. Both fields are converted to double on every
 * call; the conversion throws if either field is not a valid number.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) <= to!double(fields[index2]);
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) < to!double(fields[index2]);
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) >= to!double(fields[index2]);
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) > to!double(fields[index2]);
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) == to!double(fields[index2]);
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    return to!double(fields[index1]) != to!double(fields[index2]);
}
/* Field vs. field string comparisons. The 'IStr' variants compare lazily lower-cased
 * views of both fields.
 */
bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1] == fields[index2];
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1] != fields[index2];
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    return !fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
}
290 
/* Absolute difference of two values: |v1 - v2|. */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: |v1 - v2| divided by the smaller of |v1| and |v2|. */
auto RelDiff(double v1, double v2)
{
    return abs(v1 - v2) / min(abs(v1), abs(v2));
}

/* Field vs. field difference tests. Both fields are converted to double, then their
 * absolute or relative difference is compared against 'value'.
 */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable diff = AbsDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable diff = RelDiff(to!double(fields[index1]), to!double(fields[index2]));
    return diff > value;
}
310 
311 /* Command line option handlers - There is a command line option handler for each
312  * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
313  * etc. Option handlers are passed the tests array, the predicate function, and the
314  * command line option arguments. A FieldsPredicate delegate is created and appended to
315  * the tests array. An exception is thrown if errors are detected while processing the
316  * option, the error text is intended for the end user.
317  *
318  * These option handlers have similar functionality, differing in option processing and
319  * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for a value less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
 * the string "3:1000" representing the option value. It parses the option value into
 * field index (size_t) and value (double). These are wrapped in a FieldsPredicate
 * which is added to the tests array. An error is signaled if the option string is invalid.
325  *
326  * During processing, fields indexes are converted from one-based to zero-based. As an
327  * optimization, the maximum field index is also tracked. This allows early termination of
328  * line splitting.
329  */
330 
/* Option handler for single-field tests (e.g. '--empty 5').
 *
 * Parses optionVal as a one-based field number, appends the corresponding delegate to
 * 'tests', and raises 'maxFieldIndex' if this (zero-based) field is beyond it. Throws an
 * Exception with a user-facing message if the value is not an unsigned integer or is zero.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    size_t oneBasedField;
    try
    {
        oneBasedField = to!size_t(optionVal);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.",
                   option, optionVal, option));
    }

    if (oneBasedField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
353 
/* Option handler for field-vs-number tests (e.g. '--lt 3:1000').
 *
 * Splits optionVal at the first ':' into a one-based field number and a double value,
 * appends the corresponding delegate to 'tests', and raises 'maxFieldIndex' if this
 * (zero-based) field is beyond it. Throws an Exception with a user-facing message if
 * the separator or value is missing, either part fails to parse, or the field is zero.
 *
 * The missing-separator message writes the option with a leading '--', matching every
 * other handler message in this file.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }
    size_t field;
    double value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!double;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex;
}
387 
/* Option handler for field-vs-string tests (e.g. '--str-eq 3:abc').
 *
 * Splits optionVal at the first ':' into a one-based field number and a string value,
 * appends the corresponding delegate to 'tests', and raises 'maxFieldIndex' if this
 * (zero-based) field is beyond it. Throws an Exception with a user-facing message if
 * the separator or value is missing, the field fails to parse, or the field is zero.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    if (parts[1].length == 0 || parts[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }

    size_t oneBasedField;
    string text;
    try
    {
        oneBasedField = to!size_t(parts[0]);
        text = to!string(parts[2]);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (oneBasedField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsStringDelegate(fn, fieldIndex, text);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
421 
422 /* The fieldVsIStringOptionHandler lower-cases the command line argument, assuming the
423  * case-insensitive comparison will be done on lower-cased values.
424  */
/* Option handler for case-insensitive field-vs-string tests (e.g. '--istr-eq 3:abc').
 *
 * Splits optionVal at the first ':' into a one-based field number and a string value.
 * The value is converted to a dstring and lower-cased before being bound into the
 * delegate appended to 'tests'. 'maxFieldIndex' is raised if this (zero-based) field is
 * beyond it. Throws an Exception with a user-facing message if the separator or value
 * is missing, the field fails to parse, or the field is zero.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    if (parts[1].length == 0 || parts[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
                   option, optionVal, option));
    }

    size_t oneBasedField;
    string text;
    try
    {
        oneBasedField = to!size_t(parts[0]);
        text = to!string(parts[2]);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
                   option, optionVal, option));
    }

    if (oneBasedField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    dstring loweredValue = toLower(to!dstring(text));
    tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredValue);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
458 
/* Option handler for field-vs-regex tests (e.g. '--regex 3:ab*c').
 *
 * Splits optionVal at the first ':' into a one-based field number and a regex pattern.
 * The pattern is compiled with the "i" flag when caseSensitive is false. The resulting
 * delegate is appended to 'tests' and 'maxFieldIndex' is raised if this (zero-based)
 * field is beyond it. Throws an Exception with a user-facing message if the separator
 * or pattern is missing, the field or pattern is invalid, or the field is zero.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    auto parts = optionVal.findSplit(":");
    if (parts[1].length == 0 || parts[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }

    size_t oneBasedField;
    Regex!char pattern;
    try
    {
        string flags = "i";
        if (caseSensitive) flags = "";
        oneBasedField = to!size_t(parts[0]);
        pattern = regex(parts[2], flags);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
                   option, optionVal, option));
    }

    if (oneBasedField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, pattern);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
494 
/* Option handler for field-vs-field tests (e.g. '--ff-le 2:4').
 *
 * Splits optionVal at the first ':' into two one-based field numbers, appends the
 * corresponding delegate to 'tests', and raises 'maxFieldIndex' to cover both
 * (zero-based) fields. Throws an Exception with a user-facing message if the separator
 * or second field is missing, either field fails to parse, either field is zero, or
 * both fields are the same.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    if (parts[1].length == 0 || parts[2].length == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }

    size_t firstField;
    size_t secondField;
    try
    {
        firstField = to!size_t(parts[0]);
        secondField = to!size_t(parts[2]);
    }
    catch (Exception)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }

    if (firstField == 0 || secondField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    if (firstField == secondField)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    immutable size_t firstIndex = firstField - 1;
    immutable size_t secondIndex = secondField - 1;
    tests ~= makeFieldVsFieldDelegate(fn, firstIndex, secondIndex);
    maxFieldIndex = max(maxFieldIndex, firstIndex, secondIndex);
}
536 
537 
/* Option handler for two-field-plus-number tests (e.g. '--ff-absdiff-le 1:3:0.25').
 *
 * Splits optionVal at the first two ':' separators into two one-based field numbers
 * and a double value, appends the corresponding delegate to 'tests', and raises
 * 'maxFieldIndex' to cover both (zero-based) fields. Throws an Exception with a
 * user-facing message if any part is missing or fails to parse, either field is zero,
 * or both fields are the same.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t firstField;
    size_t secondField;
    double value;
    bool parsedOk = false;

    auto outer = optionVal.findSplit(":");
    if (outer[1].length != 0 && outer[2].length != 0)
    {
        /* The remainder after the first ':' holds '<field2>:<num>'. */
        auto inner = outer[2].findSplit(":");
        if (inner[1].length != 0 && inner[2].length != 0)
        {
            try
            {
                firstField = to!size_t(outer[0]);
                secondField = to!size_t(inner[0]);
                value = to!double(inner[2]);
                parsedOk = true;
            }
            catch (Exception)
            {
                /* Leave parsedOk false; the common error below reports the problem. */
            }
        }
    }

    if (!parsedOk)
    {
        throw new Exception(
            format("Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
                   option, optionVal, option));
    }
    if (firstField == 0 || secondField == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }
    if (firstField == secondField)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal));
    }

    immutable size_t firstIndex = firstField - 1;
    immutable size_t secondIndex = secondField - 1;
    tests ~= makeFieldFieldNumDelegate(fn, firstIndex, secondIndex, value);
    maxFieldIndex = max(maxFieldIndex, firstIndex, secondIndex);
}
589 
590 /* Command line options - This struct holds the results of command line option processing.
591  * It also has a method, processArgs, that invokes command line arg processing.
592  */
593 struct TsvFilterOptions
594 {
595     string programName;
596     FieldsPredicate[] tests;         // Derived from tests
597     size_t maxFieldIndex;            // Derived from tests
598     bool hasHeader = false;          // --H|header
599     bool invert = false;             // --invert
600     bool disjunct = false;           // --or
601     char delim = '\t';               // --delimiter
602     bool helpVerbose = false;        // --help-verbose
603     bool helpOptions = false;        // --help-options
604     bool versionWanted = false;      // --V|version
605 
606     /* Returns a tuple. First value is true if command line arguments were successfully
607      * processed and execution should continue, or false if an error occurred or the user
608      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
609      *
610      * Returning true (execution continues) means args have been validated and the
611      * tests array has been established.
612      */
613     auto processArgs (ref string[] cmdArgs)
614     {
615         import std.getopt;
616         import std.path : baseName, stripExtension;
617         import getopt_inorder;
618 
619         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
620 
621         /* Command option handlers - One handler for each option. These conform to the
622          * getopt required handler signature, and separate knowledge the specific command
623          * option text from the option processing.
624          */
625         void handlerFldEmpty(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty,    option, value); }
626         void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); }
627         void handlerFldBlank(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank,    option, value); }
628         void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); }
629 
630         void handlerFldIsNumeric(string option, string value)  { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); }
631         void handlerFldIsFinite(string option, string value)   { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); }
632         void handlerFldIsNaN(string option, string value)      { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); }
633         void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); }
634 
635         void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); }
636         void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); }
637         void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); }
638         void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); }
639         void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); }
640         void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); }
641 
        /* getopt callbacks for the field-vs-string comparisons registered below as
         * --str-le, --str-lt, --str-ge, --str-gt, --str-eq and --str-ne (option value
         * documented as FIELD:STR). Each one hands the option name and value to
         * fieldVsStringOptionHandler along with `tests`, `maxFieldIndex` and the
         * address of the comparison function.
         */
        void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); }
        void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
        void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
        void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
        void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
        void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }

        /* Substring tests registered below as --str-in-fld and --str-not-in-fld; they
         * go through the same fieldVsStringOptionHandler helper.
         */
        void handlerStrInFld(string option, string value)    { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld,    option, value); }
        void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }
651 
        /* getopt callbacks for the tests registered below as --istr-eq, --istr-ne,
         * --istr-in-fld and --istr-not-in-fld (described in the option help as
         * case-insensitive). These forward to fieldVsIStringOptionHandler rather than
         * fieldVsStringOptionHandler.
         * NOTE(review): how the helper normalizes case is not visible in this section.
         */
        void handlerIStrEQ(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ,       option, value); }
        void handlerIStrNE(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE,       option, value); }
        void handlerIStrInFld(string option, string value)    { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld,    option, value); }
        void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }
656 
        /* getopt callbacks for the regular expression tests registered below as
         * --regex, --not-regex, --iregex and --not-iregex. The case-insensitive
         * variants reuse the same regexMatch / regexNotMatch functions; the only
         * difference is the trailing bool passed to fieldVsRegexOptionHandler
         * (true for --regex/--not-regex, false for --iregex/--not-iregex).
         * NOTE(review): presumably that bool selects case-sensitive matching -
         * confirm against fieldVsRegexOptionHandler, which is outside this section.
         */
        void handlerRegexMatch(string option, string value)     { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, true); }
        void handlerRegexNotMatch(string option, string value)  { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
        void handlerIRegexMatch(string option, string value)    { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, false); }
        void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }
661 
        /* getopt callbacks for the field-vs-field numeric comparisons registered
         * below as --ff-le, --ff-lt, --ff-ge, --ff-gt, --ff-eq and --ff-ne (option
         * value documented as FIELD1:FIELD2). Each one hands the option name and value
         * to fieldVsFieldOptionHandler along with `tests`, `maxFieldIndex` and the
         * address of the comparison function.
         */
        void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
        void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
        void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
        void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
        void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
        void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }

        /* Field-vs-field string comparisons registered below as --ff-str-eq,
         * --ff-str-ne, --ff-istr-eq and --ff-istr-ne. Unlike the field-vs-string
         * options, the case-insensitive variants use the same helper
         * (fieldVsFieldOptionHandler) and differ only in the function passed.
         */
        void handlerFFStrEQ(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ,  option, value); }
        void handlerFFStrNE(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE,  option, value); }
        void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
        void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }
673 
        /* getopt callbacks for the two-field difference tests registered below as
         * --ff-absdiff-le, --ff-absdiff-gt, --ff-reldiff-le and --ff-reldiff-gt
         * (option value documented as FIELD1:FIELD2:NUM). Each one hands the option
         * name and value to fieldFieldNumOptionHandler along with `tests`,
         * `maxFieldIndex` and the address of the test function.
         */
        void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
        void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
        void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
        void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }
678 
679         try
680         {
681             arraySep = ",";    // Use comma to separate values in command line options
682             auto r = getoptInorder(
683                 cmdArgs,
684                 "help-verbose",    "     Print full help.", &helpVerbose,
685                 "help-options",    "     Print the options list by itself.", &helpOptions,
686                  std.getopt.config.caseSensitive,
687                 "V|version",       "     Print version information and exit.", &versionWanted,
688                 "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
689                 std.getopt.config.caseInsensitive,
690                 "or",              "     Evaluate tests as an OR rather than an AND.", &disjunct,
691                 std.getopt.config.caseSensitive,
692                 "v|invert",        "     Invert the filter, printing lines that do not match.", &invert,
693                 std.getopt.config.caseInsensitive,
694                 "d|delimiter",     "CHR  Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
695 
696                 "empty",           "FIELD       True if field is empty.", &handlerFldEmpty,
697                 "not-empty",       "FIELD       True if field is not empty.", &handlerFldNotEmpty,
698                 "blank",           "FIELD       True if field is empty or all whitespace.", &handlerFldBlank,
699                 "not-blank",       "FIELD       True if field contains a non-whitespace character.", &handlerFldNotBlank,
700 
701                 "is-numeric",      "FIELD       True if field is interpretable as a number.", &handlerFldIsNumeric,
702                 "is-finite",       "FIELD       True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
703                 "is-nan",          "FIELD       True if field is NaN.", &handlerFldIsNaN,
704                 "is-infinity",     "FIELD       True if field is infinity.", &handlerFldIsInfinity,
705 
706                 "le",              "FIELD:NUM   FIELD <= NUM (numeric).", &handlerNumLE,
707                 "lt",              "FIELD:NUM   FIELD <  NUM (numeric).", &handlerNumLT,
708                 "ge",              "FIELD:NUM   FIELD >= NUM (numeric).", &handlerNumGE,
709                 "gt",              "FIELD:NUM   FIELD >  NUM (numeric).", &handlerNumGT,
710                 "eq",              "FIELD:NUM   FIELD == NUM (numeric).", &handlerNumEQ,
711                 "ne",              "FIELD:NUM   FIELD != NUM (numeric).", &handlerNumNE,
712 
713                 "str-le",          "FIELD:STR   FIELD <= STR (string).", &handlerStrLE,
714                 "str-lt",          "FIELD:STR   FIELD <  STR (string).", &handlerStrLT,
715                 "str-ge",          "FIELD:STR   FIELD >= STR (string).", &handlerStrGE,
716                 "str-gt",          "FIELD:STR   FIELD >  STR (string).", &handlerStrGT,
717                 "str-eq",          "FIELD:STR   FIELD == STR (string).", &handlerStrEQ,
718                 "istr-eq",         "FIELD:STR   FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
719                 "str-ne",          "FIELD:STR   FIELD != STR (string).", &handlerStrNE,
720                 "istr-ne",         "FIELD:STR   FIELD != STR (string, case-insensitive).", &handlerIStrNE,
721                 "str-in-fld",      "FIELD:STR   FIELD contains STR (substring search).", &handlerStrInFld,
722                 "istr-in-fld",     "FIELD:STR   FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
723                 "str-not-in-fld",  "FIELD:STR   FIELD does not contain STR (substring search).", &handlerStrNotInFld,
724                 "istr-not-in-fld", "FIELD:STR   FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,
725 
726                 "regex",           "FIELD:REGEX   FIELD matches regular expression.", &handlerRegexMatch,
727                 "iregex",          "FIELD:REGEX   FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
728                 "not-regex",       "FIELD:REGEX   FIELD does not match regular expression.", &handlerRegexNotMatch,
729                 "not-iregex",      "FIELD:REGEX   FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,
730 
731                 "ff-le",           "FIELD1:FIELD2   FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
732                 "ff-lt",           "FIELD1:FIELD2   FIELD1 <  FIELD2 (numeric).", &handlerFFLT,
733                 "ff-ge",           "FIELD1:FIELD2   FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
734                 "ff-gt",           "FIELD1:FIELD2   FIELD1 >  FIELD2 (numeric).", &handlerFFGT,
735                 "ff-eq",           "FIELD1:FIELD2   FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
736                 "ff-ne",           "FIELD1:FIELD2   FIELD1 != FIELD2 (numeric).", &handlerFFNE,
737                 "ff-str-eq",       "FIELD1:FIELD2   FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
738                 "ff-istr-eq",      "FIELD1:FIELD2   FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
739                 "ff-str-ne",       "FIELD1:FIELD2   FIELD1 != FIELD2 (string).", &handlerFFStrNE,
740                 "ff-istr-ne",      "FIELD1:FIELD2   FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,
741 
742                 "ff-absdiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
743                 "ff-absdiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2)  > NUM", &handlerFFAbsDiffGT,
744                 "ff-reldiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
745                 "ff-reldiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2))  > NUM", &handlerFFRelDiffGT,
746                 );
747 
748             /* Both help texts are a bit long. In this case, for "regular" help, don't
749              * print options, just the text. The text summarizes the options.
750              */
751             if (r.helpWanted)
752             {
753                 stdout.write(helpText);
754                 return tuple(false, 0);
755             }
756             else if (helpVerbose)
757             {
758                 defaultGetoptPrinter(helpTextVerbose, r.options);
759                 return tuple(false, 0);
760             }
761             else if (helpOptions)
762             {
763                 defaultGetoptPrinter(helpTextOptions, r.options);
764                 return tuple(false, 0);
765             }
766             else if (versionWanted)
767             {
768                 import tsvutils_version;
769                 writeln(tsvutilsVersionNotice("tsv-filter"));
770                 return tuple(false, 0);
771             }
772         }
773         catch (Exception exc)
774         {
775             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
776             return tuple(false, 1);
777         }
778         return tuple(true, 0);
779     }
780 }
781 
/** Reads the input files line by line and writes the lines selected by the filter.
 *
 * Each data line is split on `cmdopt.delim` and its leading `cmdopt.maxFieldIndex + 1`
 * fields are handed to the delegates in `cmdopt.tests`. A line is selected when all
 * tests pass, or when any test passes if `cmdopt.disjunct` is set; `cmdopt.invert`
 * reverses the outcome. When `cmdopt.hasHeader` is set, line 1 of every file is
 * treated as a header: it is written once (from the first file that has one) and is
 * never run through the tests.
 *
 * Params:
 *   cmdopt     = Options and tests assembled from the command line.
 *   inputFiles = Files to read. "-" stands for standard input, which is also read
 *                when the list is empty.
 *
 * Throws: Exception when a line has fewer fields than the tests need, or when
 *   evaluating or writing a line throws (the original message is wrapped with the
 *   file name and line number). Errors from opening or reading a file and from
 *   throwIfWindowsNewlineOnUnix are not caught here.
 */
void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsvutil : BufferedOutputRange, throwIfWindowsNewlineOnUnix;

    /* Output is buffered for throughput, but it should still feel responsive when
     * matches are rare. The counter starts above the threshold so that the first
     * selected line is flushed immediately; after that a flush is forced whenever
     * a selected line follows a long run of input lines without any flush.
     */
    enum flushThreshold = 1024;
    size_t linesSinceFlush = flushThreshold + 1;

    auto outBuffer = BufferedOutputRange!(typeof(stdout))(stdout);

    /* One slot per field the tests may look at; reused for every line. */
    auto fields = new char[][](cmdopt.maxFieldIndex + 1);
    bool headerEmitted = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool isStdin = (filename == "-");
        auto sourceLabel = isStdin ? "Standard Input" : filename;
        auto inputStream = isStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, lineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line: emit it only once across all files. */
                    if (!headerEmitted)
                    {
                        outBuffer.appendln(line);
                        headerEmitted = true;
                    }
                    continue;
                }
            }

            /* Collect fields until every slot the tests need has been filled. */
            size_t numFields = 0;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (numFields == fields.length) break;
                fields[numFields] = fieldValue;
                numFields++;
            }

            if (numFields == 0)
            {
                /* Work-around for splitter yielding nothing on an empty line.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Treat the (empty) line itself as the single field.
                 */
                assert(line.length == 0);
                fields[0] = line;
                numFields = 1;
            }

            if (numFields < fields.length)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           sourceLabel, lineNum));
            }

            /* Evaluate the tests. A test throws when a field cannot be converted
             * to the type it expects; that is reported with file/line context.
             */
            try
            {
                linesSinceFlush++;

                immutable bool testsPassed = cmdopt.disjunct
                    ? cmdopt.tests.any!(test => test(fields))
                    : cmdopt.tests.all!(test => test(fields));
                immutable bool selected = cmdopt.invert ? !testsPassed : testsPassed;

                if (selected)
                {
                    immutable bool flushedByAppend = outBuffer.appendln(line);
                    immutable bool flushOverdue =
                        !flushedByAppend && linesSinceFlush > flushThreshold;

                    if (flushOverdue) outBuffer.flush;
                    if (flushedByAppend || flushOverdue) linesSinceFlush = 0;
                }
            }
            catch (Exception exc)
            {
                immutable string headerHint =
                    (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : "";
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, sourceLabel, lineNum, headerHint));
            }
        }
    }
}