1 /**
2 Command line tool that filters TSV files.
3 
4 This tool filters tab-delimited files based on numeric or string comparisons
5 against specific fields. See the helpText string for details.
6 
7 Copyright (c) 2015-2018, eBay Software Foundation
8 Initially written by Jon Degenhardt
9 
10 License: Boost Licence 1.0 (http://boost.org/LICENSE_1_0.txt)
11 */
12 module tsv_filter;
13 
14 import std.algorithm : canFind, equal, findSplit, max, min;
15 import std.conv : to;
16 import std.format : format;
17 import std.math : abs, isFinite, isInfinity, isNaN;
18 import std.regex;
19 import std.stdio;
20 import std.string : isNumeric;
21 import std.typecons : tuple;
22 import std.uni: asLowerCase, toLower;
23 
24 /* The program has two main parts, command line arg processing and processing the input
25  * files. Much of the work is in command line arg processing. This sets up the tests run
26  * against each input line. The tests are an array of delegates (closures) run against the
27  * fields in the line. The tests are based on command line arguments, of which there is
28  * a lengthy set, one for each test.
29  */
30 
/** Program entry point.
 *
 * Runs command line processing via TsvFilterOptions.processArgs and, when that asks for
 * execution to continue, calls tsvFilter with the options and cmdArgs[1..$]. An
 * Exception escaping tsvFilter is reported on stderr.
 *
 * Returns: The exit code supplied by processArgs when it asks to stop, 1 if tsvFilter
 * throws an Exception, otherwise 0.
 */
int main(string[] cmdArgs)
{
    /* Coverage builds made with DMD merge their reports across runs. */
    version(D_Coverage) version(DigitalMars)
    {
        import core.runtime : dmd_coverSetMerge;
        dmd_coverSetMerge(true);
    }

    TsvFilterOptions cmdopt;
    auto argStatus = cmdopt.processArgs(cmdArgs);
    if (!argStatus[0])
    {
        /* Either an error or a help/version request; the second slot is the exit code. */
        return argStatus[1];
    }

    /* LDC profiling builds reset the counters once argument processing is finished. */
    version(LDC_Profile)
    {
        import ldc.profile : resetAll;
        resetAll();
    }

    int exitCode = 0;
    try
    {
        tsvFilter(cmdopt, cmdArgs[1..$]);
    }
    catch (Exception exc)
    {
        stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
        exitCode = 1;
    }
    return exitCode;
}
59 
/* Short help text: synopsis, global options, and one syntax/example entry per operator
 * family. Every example should use an operator listed on the 'Syntax' line above it.
 */
auto helpText = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter tab-delimited files for matching lines via comparison tests against
individual fields. Use '--help-verbose' for a more detailed description.

Global options:
  --help-verbose      Print full help.
  --help-options      Print the options list by itself.
  --V|version         Print version information and exit.
  --H|header          Treat the first line of each file as a header.
  --or                Evaluate tests as an OR rather than an AND clause.
  --v|invert          Invert the filter, printing lines that do not match.
  --d|delimiter CHR   Field delimiter. Default: TAB.

Operators:
* Test if a field is empty (no characters) or blank (empty or whitespace only).
  Syntax:  --empty|not-empty|blank|not-blank  FIELD
  Example: --empty 5          // True if field 5 is empty

* Test if a field is numeric, finite, NaN, or infinity
  Syntax:  --is-numeric|is-finite|is-nan|is-infinity FIELD
  Example: --is-numeric 5 --gt 5:100  // Ensure field 5 is numeric before --gt test.

* Compare a field to a number (integer or float)
  Syntax:  --eq|ne|lt|le|gt|ge  FIELD:NUM
  Example: --lt 5:1000 --gt 2:0.5  // True if (field 5 < 1000) and (field 2 > 0.5)

* Compare a field to a string
  Syntax:  --str-eq|str-ne  FIELD:STR
  Example: --str-eq 3:abc        // True if field 3 is "abc"

* Test if a field contains a string (substring search)
  Syntax:  --str-in-fld|str-not-in-fld|istr-in-fld|istr-not-in-fld  FIELD:STR
  Example: --str-in-fld 1:hello  // True if field 1 contains "hello"

* Test if a field matches a regular expression.
  Syntax:  --regex|iregex|not-regex|not-iregex  FIELD:REGEX
  Example: --regex '3:ab*c'      // True if field 3 contains "ac", "abc", "abbc", etc.

* Field to field comparisons - Similar to field vs literal comparisons, but field vs field.
  Syntax:  --ff-eq|ff-ne|ff-lt|ff-le|ff-gt|ff-ge  FIELD1:FIELD2
           --ff-str-eq|ff-str-ne|ff-istr-eq|ff-istr-ne  FIELD1:FIELD2
  Example: --ff-eq 2:4           // True if fields 2 and 4 are numerically equivalent
           --ff-str-eq 2:4       // True if fields 2 and 4 are the same strings

* Field to field difference comparisons - Absolute and relative difference
  Syntax:  --ff-absdiff-le|ff-absdiff-gt FIELD1:FIELD2:NUM
           --ff-reldiff-le|ff-reldiff-gt FIELD1:FIELD2:NUM
  Example: --ff-absdiff-le 1:3:0.25   // True if abs(field1 - field2) <= 0.25

EOS";
112 
/* Introductory text for the verbose help: synopsis, an overview of the test syntax with
 * an example, the available test families, and behavioral details. The text ends with an
 * 'Options:' heading; presumably the caller prints the option list after it (the code
 * doing so is not in this part of the file).
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-filter [options] [file...]

Filter lines of tab-delimited files via comparison tests against fields. Multiple
tests can be specified, by default they are evaluated as AND clause. Lines
satisfying the tests are written to standard output.

Typical test syntax is '--op field:value', where 'op' is an operator, 'field' is a
1-based field index, and 'value' is the comparison basis. For example, '--lt 3:500'
tests if field 3 is less than 500. A more complete example:

  tsv-filter --header --gt 1:50 --lt 1:100 --le 2:1000 data.tsv

This outputs all lines from file data.tsv where field 1 is greater than 50 and less
than 100, and field 2 is less than or equal to 1000. The header is also output.

Tests available include:
  * Test if a field is empty (no characters) or blank (empty or whitespace only).
  * Test if a field is interpretable as a number, a finite number, NaN, or Infinity.
  * Compare a field to a number - Numeric equality and relational tests.
  * Compare a field to a string - String equality and relational tests.
  * Test if a field matches a regular expression. Case sensitive or insensitive.
  * Test if a field contains a string. Sub-string search, case sensitive or insensitive.
  * Field to field comparisons - Similar to the other tests, except comparing
    one field to another in the same line.

Details:
  * The run is aborted if there are not enough fields in an input line.
  * Numeric tests will fail and abort the run if a field cannot be interpreted as a
    number. This includes fields with no text. To avoid this use '--is-numeric' or
    '--is-finite' prior to the numeric test. For example, '--is-numeric 5 --gt 5:100'
    ensures field 5 is numeric before running the --gt test.
  * Regular expression syntax is defined by the D programming language. They follow
    common conventions (perl, python, etc.). Most common forms work as expected.

Options:
EOS";
150 
/* Header text for the options-only help: the synopsis followed by an 'Options:' heading.
 * Presumably the option list is printed after it by the caller (not visible in this part
 * of the file).
 */
auto helpTextOptions = q"EOS
Synopsis: tsv-filter [options] [file...]

Options:
EOS";
156 
/* The next blocks of code define the structure of the boolean tests run against input lines.
 * This includes function and delegate (closure) signatures, creation mechanisms, option
 * handlers, etc. Command line arg processing uses these pieces to build the test structure.
*/
161 
/* FieldsPredicate delegate signature - Each input line is run against a set of boolean
 * tests. Each test is a 'FieldsPredicate'. A FieldsPredicate is a delegate (closure)
 * containing all info about the test except the field values of the line being tested.
 * These delegates are created as part of command line arg processing. The wrapped data
 * includes operation, field indexes, literal values, etc. At run-time the delegate is
 * passed one argument, the split input line, and returns the boolean result of the test.
 */
alias FieldsPredicate = bool delegate(const char[][] fields);
170 
/* FieldsPredicate function signatures - These aliases represent the different function
 * signatures used in FieldsPredicate delegates. Each alias has a corresponding 'make'
 * function. The 'make' function takes a real predicate function and closure args and
 * returns a FieldsPredicate delegate. Predicate types are:
 *
 * - FieldUnaryPredicate - Test based on a single field. (e.g. --empty 4)
 * - FieldVsNumberPredicate - Test based on a field index (used to get the field value)
 *   and a fixed numeric value. For example, field 2 less than 100 (--lt 2:100).
 * - FieldVsStringPredicate - Test based on a field and a string. (e.g. --str-eq 2:abc)
 * - FieldVsIStringPredicate - Case-insensitive test based on a field and a string.
 *   (e.g. --istr-eq 2:abc)
 * - FieldVsRegexPredicate - Test based on a field and a regex. (e.g. --regex '2:ab*c')
 * - FieldVsFieldPredicate - Test based on two fields. (e.g. --ff-le 2:4).
 * - FieldFieldNumPredicate - Test based on two fields and a fixed numeric value. This
 *   is the signature of the absolute/relative difference predicates such as ffAbsDiffLE.
 *
 * An actual FieldsPredicate takes the fields from the line and the closure args and
 * runs the test. For example, a function testing if a field is less than a specific
 * value would pull the specified field from the fields array, convert the string to
 * a number, then run the less-than test.
 *
 * In every signature the index arguments are positions in the 'fields' array; the
 * predicate functions use them directly as 'fields[index]'.
 */
alias FieldUnaryPredicate    = bool function(const char[][] fields, size_t index);
alias FieldVsNumberPredicate = bool function(const char[][] fields, size_t index, double value);
alias FieldVsStringPredicate = bool function(const char[][] fields, size_t index, string value);
alias FieldVsIStringPredicate = bool function(const char[][] fields, size_t index, dstring value);
alias FieldVsRegexPredicate  = bool function(const char[][] fields, size_t index, Regex!char value);
alias FieldVsFieldPredicate  = bool function(const char[][] fields, size_t index1, size_t index2);
alias FieldFieldNumPredicate  = bool function(const char[][] fields, size_t index1, size_t index2, double value);
197 
/* Wraps a single-field predicate and its field index in a FieldsPredicate closure. */
FieldsPredicate makeFieldUnaryDelegate(FieldUnaryPredicate fn, size_t index)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index);
    };
}
202 
/* Wraps a field-vs-number predicate, its field index, and the number to compare
 * against in a FieldsPredicate closure.
 */
FieldsPredicate makeFieldVsNumberDelegate(FieldVsNumberPredicate fn, size_t index, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
207 
/* Wraps a field-vs-string predicate, its field index, and the string to compare
 * against in a FieldsPredicate closure.
 */
FieldsPredicate makeFieldVsStringDelegate(FieldVsStringPredicate fn, size_t index, string value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
212 
/* Wraps a case-insensitive field-vs-string predicate, its field index, and the dstring
 * to compare against in a FieldsPredicate closure.
 */
FieldsPredicate makeFieldVsIStringDelegate(FieldVsIStringPredicate fn, size_t index, dstring value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
217 
/* Wraps a field-vs-regex predicate, its field index, and the compiled regex in a
 * FieldsPredicate closure.
 */
FieldsPredicate makeFieldVsRegexDelegate(FieldVsRegexPredicate fn, size_t index, Regex!char value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index, value);
    };
}
222 
/* Wraps a field-vs-field predicate and its two field indexes in a FieldsPredicate closure. */
FieldsPredicate makeFieldVsFieldDelegate(FieldVsFieldPredicate fn, size_t index1, size_t index2)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2);
    };
}
227 
/* Wraps a two-field-plus-number predicate, its two field indexes, and the number in a
 * FieldsPredicate closure.
 */
FieldsPredicate makeFieldFieldNumDelegate(FieldFieldNumPredicate fn, size_t index1, size_t index2, double value)
{
    return delegate bool(const char[][] fields)
    {
        return fn(fields, index1, index2, value);
    };
}
232 
233 /* Predicate functions - These are the actual functions used in a FieldsPredicate. They
234  * are a direct reflection of the operators available via command line args. Each matches
235  * one of the FieldsPredicate function aliases defined above.
236  */
/* True if the field has no characters at all. */
bool fldEmpty(const char[][] fields, size_t index)
{
    return !fields[index].length;
}

/* True if the field has at least one character. */
bool fldNotEmpty(const char[][] fields, size_t index)
{
    return fields[index].length > 0;
}

/* True if the field matches `^\s*$`, i.e. it is empty or whitespace only. */
bool fldBlank(const char[][] fields, size_t index)
{
    auto blankMatch = matchFirst(fields[index], ctRegex!`^\s*$`);
    return !blankMatch.empty;
}

/* True if the field does not match `^\s*$`. */
bool fldNotBlank(const char[][] fields, size_t index)
{
    auto blankMatch = matchFirst(fields[index], ctRegex!`^\s*$`);
    return blankMatch.empty;
}
241 
/* Converts a field to a double for the finite/NaN/infinity tests.
 *
 * Returns false, rather than throwing, when the field cannot be used as a double. This
 * matters because std.string.isNumeric accepts some forms that std.conv.to!double
 * rejects (for example the D literal suffix in "123L"). Without the catch, such a
 * field would throw a ConvException out of the predicate.
 *
 * Params:
 *   field = The field text.
 *   value = Receives the converted number when the function returns true.
 */
private bool fieldAsDouble(const char[] field, out double value)
{
    import std.conv : ConvException;

    if (!field.isNumeric) return false;
    try
    {
        value = field.to!double;
    }
    catch (ConvException)
    {
        return false;
    }
    return true;
}

/* True if std.string.isNumeric accepts the field. */
bool fldIsNumeric(const char[][] fields, size_t index) { return fields[index].isNumeric; }

/* True if the field converts to a double and the value is finite. */
bool fldIsFinite(const char[][] fields, size_t index)
{
    double value;
    return fieldAsDouble(fields[index], value) && value.isFinite;
}

/* True if the field converts to a double and the value is NaN. */
bool fldIsNaN(const char[][] fields, size_t index)
{
    double value;
    return fieldAsDouble(fields[index], value) && value.isNaN;
}

/* True if the field converts to a double and the value is infinite. */
bool fldIsInfinity(const char[][] fields, size_t index)
{
    double value;
    return fieldAsDouble(fields[index], value) && value.isInfinity;
}
246 
/* Field-vs-number comparisons. Each converts the field to a double (the conversion
 * throws if the text is not a number) and compares it against 'val'.
 */
bool numLE(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum <= val;
}

bool numLT(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum < val;
}

bool numGE(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum >= val;
}

bool numGT(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum > val;
}

bool numEQ(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum == val;
}

bool numNE(const char[][] fields, size_t index, double val)
{
    immutable double fieldNum = to!double(fields[index]);
    return fieldNum != val;
}
253 
/* Field-vs-string comparisons using the built-in array comparison operators, plus
 * case-sensitive substring tests.
 */
bool strLE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText <= val;
}

bool strLT(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText < val;
}

bool strGE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText >= val;
}

bool strGT(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText > val;
}

bool strEQ(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText == val;
}

bool strNE(const char[][] fields, size_t index, string val)
{
    const(char)[] fieldText = fields[index];
    return fieldText != val;
}

bool strInFld(const char[][] fields, size_t index, string val)
{
    return canFind(fields[index], val);
}

bool strNotInFld(const char[][] fields, size_t index, string val)
{
    immutable bool found = canFind(fields[index], val);
    return !found;
}
262 
/* Case-insensitive field-vs-string predicates. Only the field is lower-cased here; the
 * comparison value is expected to arrive already lower-cased (fieldVsIStringOptionHandler
 * lower-cases the command line value before creating the test).
 */
bool istrEQ(const char[][] fields, size_t index, dstring val)
{
    return equal(asLowerCase(fields[index]), val);
}

bool istrNE(const char[][] fields, size_t index, dstring val)
{
    immutable bool same = equal(asLowerCase(fields[index]), val);
    return !same;
}

bool istrInFld(const char[][] fields, size_t index, dstring val)
{
    return canFind(asLowerCase(fields[index]), val);
}

bool istrNotInFld(const char[][] fields, size_t index, dstring val)
{
    immutable bool found = canFind(asLowerCase(fields[index]), val);
    return !found;
}
269 
/* Regex predicates. The case-sensitivity setting is part of the compiled regex passed
 * in, so the same two functions serve both the case-sensitive and the case-insensitive
 * regex operators.
 */
bool regexMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto found = matchFirst(fields[index], val);
    return !found.empty;
}

bool regexNotMatch(const char[][] fields, size_t index, Regex!char val)
{
    auto found = matchFirst(fields[index], val);
    return found.empty;
}
275 
/* Field-vs-field predicates. The numeric versions convert both fields to double (the
 * conversion throws if a field is not a number); the string versions compare the field
 * text directly, or lower-cased for the case-insensitive forms.
 */
bool ffLE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs <= rhs;
}

bool ffLT(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs < rhs;
}

bool ffGE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs >= rhs;
}

bool ffGT(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs > rhs;
}

bool ffEQ(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs == rhs;
}

bool ffNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable double lhs = to!double(fields[index1]);
    immutable double rhs = to!double(fields[index2]);
    return lhs != rhs;
}

bool ffStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    const(char)[] lhs = fields[index1];
    const(char)[] rhs = fields[index2];
    return lhs == rhs;
}

bool ffStrNE(const char[][] fields, size_t index1, size_t index2)
{
    const(char)[] lhs = fields[index1];
    const(char)[] rhs = fields[index2];
    return lhs != rhs;
}

bool ffIStrEQ(const char[][] fields, size_t index1, size_t index2)
{
    return fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
}

bool ffIStrNE(const char[][] fields, size_t index1, size_t index2)
{
    immutable bool same = fields[index1].asLowerCase.equal(fields[index2].asLowerCase);
    return !same;
}
292 
/* Absolute difference: |v1 - v2|. */
auto AbsDiff(double v1, double v2)
{
    return abs(v1 - v2);
}

/* Relative difference: |v1 - v2| divided by the smaller of |v1| and |v2|. */
auto RelDiff(double v1, double v2)
{
    immutable double gap = abs(v1 - v2);
    immutable double smallerMagnitude = min(abs(v1), abs(v2));
    return gap / smallerMagnitude;
}

/* Field-to-field difference predicates. Both fields are converted to double (the
 * conversion throws if a field is not a number) and their absolute or relative
 * difference is compared against 'value'.
 */
bool ffAbsDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double first = to!double(fields[index1]);
    immutable double second = to!double(fields[index2]);
    return AbsDiff(first, second) <= value;
}

bool ffAbsDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double first = to!double(fields[index1]);
    immutable double second = to!double(fields[index2]);
    return AbsDiff(first, second) > value;
}

bool ffRelDiffLE(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double first = to!double(fields[index1]);
    immutable double second = to!double(fields[index2]);
    return RelDiff(first, second) <= value;
}

bool ffRelDiffGT(const char[][] fields, size_t index1, size_t index2, double value)
{
    immutable double first = to!double(fields[index1]);
    immutable double second = to!double(fields[index2]);
    return RelDiff(first, second) > value;
}
312 
313 /* Command line option handlers - There is a command line option handler for each
314  * predicate type. That is, one each for FieldUnaryPredicate, FieldVsNumberPredicate,
315  * etc. Option handlers are passed the tests array, the predicate function, and the
316  * command line option arguments. A FieldsPredicate delegate is created and appended to
317  * the tests array. An exception is thrown if errors are detected while processing the
318  * option, the error text is intended for the end user.
319  *
320  * These option handlers have similar functionality, differing in option processing and
321  * error message generation. fieldVsNumberOptionHandler is described as an example. It
 * handles command options such as '--lt 3:1000', which tests field 3 for a value less
 * than 1000. It is passed the tests array, the 'numLT' function to use for the test, and
324  * the string "3:1000" representing the option value. It parses the option value into
325  * field index (unsigned int) and value (double). These are wrapped in a FieldsPredicate
326  * which is added to the tests array. An error is signaled if the option string is invalid.
327  *
328  * During processing, fields indexes are converted from one-based to zero-based. As an
329  * optimization, the maximum field index is also tracked. This allows early termination of
330  * line splitting.
331  */
332 
/* Option handler for single-field tests. 'optionVal' must be a 1-based field number.
 * On success a FieldsPredicate for 'fn' is appended to 'tests' and 'maxFieldIndex' is
 * raised if needed. An Exception with a user-facing message is thrown when the value is
 * not an unsigned integer or is zero.
 */
void fieldUnaryOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldUnaryPredicate fn, string option, string optionVal)
{
    size_t oneBasedField;
    try
    {
        oneBasedField = to!size_t(optionVal);
    }
    catch (Exception exc)
    {
        auto msg = format(
            "Invalid value in option: '--%s %s'. Expected: '--%s <field>' where field is a 1-upped integer.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldUnaryDelegate(fn, fieldIndex);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
355 
/* Option handler for field-vs-number tests such as '--lt 3:1000'.
 *
 * 'optionVal' must have the form '<field>:<val>', where <field> is a 1-based field
 * number and <val> is a number. On success a FieldsPredicate for 'fn' is appended to
 * 'tests' and 'maxFieldIndex' is raised if needed.
 *
 * Params:
 *   tests = Test array the new predicate is appended to.
 *   maxFieldIndex = Largest zero-based field index used by any test so far; updated here.
 *   fn = Predicate function implementing the comparison.
 *   option = Option name, used only in error messages. The messages add the '--' prefix.
 *   optionVal = Option value to parse.
 *
 * Throws: Exception with a user-facing message if the value has no ':' separator, has
 * nothing after the separator, does not parse as numbers, or uses field number zero.
 */
void fieldVsNumberOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsNumberPredicate fn, string option, string optionVal)
{
    auto valSplit = findSplit(optionVal, ":");
    if (valSplit[1].length == 0 || valSplit[2].length == 0)
    {
        /* The '--' prefix is written in the message, as in every other option error. */
        throw new Exception(
            format("Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }
    size_t field;
    double value;
    try
    {
        field = valSplit[0].to!size_t;
        value = valSplit[2].to!double;
    }
    catch (Exception exc)
    {
        throw new Exception(
            format("Invalid numeric values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> and <val> are numbers.",
                   option, optionVal, option));
    }

    if (field == 0)
    {
        throw new Exception(
            format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal));
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    size_t zeroBasedIndex = field - 1;
    tests ~= makeFieldVsNumberDelegate(fn, zeroBasedIndex, value);
    maxFieldIndex = (zeroBasedIndex > maxFieldIndex) ? zeroBasedIndex : maxFieldIndex;
}
389 
/* Option handler for field-vs-string tests. 'optionVal' must have the form
 * '<field>:<val>' with a 1-based field number and a non-empty string. On success a
 * FieldsPredicate for 'fn' is appended to 'tests' and 'maxFieldIndex' is raised if
 * needed; otherwise an Exception with a user-facing message is thrown.
 */
void fieldVsStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsStringPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasValue = parts[2].length != 0;
    if (!(hasSeparator && hasValue))
    {
        auto msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    string compareText;
    try
    {
        oneBasedField = to!size_t(parts[0]);
        compareText = to!string(parts[2]);
    }
    catch (Exception exc)
    {
        auto msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsStringDelegate(fn, fieldIndex, compareText);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
423 
/* Option handler for case-insensitive field-vs-string tests. 'optionVal' must have the
 * form '<field>:<val>' with a 1-based field number and a non-empty string. The string is
 * converted to a dstring and lower-cased before being stored in the test, on the
 * assumption that the predicate compares it against a lower-cased field. An Exception
 * with a user-facing message is thrown for invalid values.
 */
void fieldVsIStringOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsIStringPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasValue = parts[2].length != 0;
    if (!(hasSeparator && hasValue))
    {
        auto msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    string compareText;
    try
    {
        oneBasedField = to!size_t(parts[0]);
        compareText = to!string(parts[2]);
    }
    catch (Exception exc)
    {
        auto msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> a string.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t fieldIndex = oneBasedField - 1;
    dstring loweredText = toLower(to!dstring(compareText));
    tests ~= makeFieldVsIStringDelegate(fn, fieldIndex, loweredText);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
460 
/* Option handler for field-vs-regex tests. 'optionVal' must have the form
 * '<field>:<val>' with a 1-based field number and a regular expression. The regex is
 * compiled here, with the "i" flag when 'caseSensitive' is false. On success a
 * FieldsPredicate for 'fn' is appended to 'tests' and 'maxFieldIndex' is raised if
 * needed; otherwise an Exception with a user-facing message is thrown.
 */
void fieldVsRegexOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsRegexPredicate fn, string option, string optionVal,
    bool caseSensitive)
{
    auto parts = optionVal.findSplit(":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasPattern = parts[2].length != 0;
    if (!(hasSeparator && hasPattern))
    {
        auto msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t oneBasedField;
    Regex!char compiledPattern;
    try
    {
        string regexFlags;
        if (caseSensitive) regexFlags = "";
        else regexFlags = "i";

        /* A bad field number and a bad regex both end up in the catch below. */
        oneBasedField = to!size_t(parts[0]);
        compiledPattern = regex(parts[2], regexFlags);
    }
    catch (Exception exc)
    {
        auto msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field>:<val>' where <field> is a number and <val> is a regular expression.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (oneBasedField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t fieldIndex = oneBasedField - 1;
    tests ~= makeFieldVsRegexDelegate(fn, fieldIndex, compiledPattern);
    maxFieldIndex = max(maxFieldIndex, fieldIndex);
}
496 
/* Option handler for field-vs-field tests. 'optionVal' must have the form
 * '<field1>:<field2>' with two different 1-based field numbers. On success a
 * FieldsPredicate for 'fn' is appended to 'tests' and 'maxFieldIndex' is raised to cover
 * both fields; otherwise an Exception with a user-facing message is thrown.
 */
void fieldVsFieldOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldVsFieldPredicate fn, string option, string optionVal)
{
    auto parts = optionVal.findSplit(":");
    immutable bool hasSeparator = parts[1].length != 0;
    immutable bool hasSecond = parts[2].length != 0;
    if (!(hasSeparator && hasSecond))
    {
        auto msg = format(
            "Invalid option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    size_t firstField;
    size_t secondField;
    try
    {
        firstField = to!size_t(parts[0]);
        secondField = to!size_t(parts[2]);
    }
    catch (Exception exc)
    {
        auto msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (firstField == 0 || secondField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    if (firstField == secondField)
    {
        auto msg = format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t firstIndex = firstField - 1;
    immutable size_t secondIndex = secondField - 1;
    tests ~= makeFieldVsFieldDelegate(fn, firstIndex, secondIndex);
    maxFieldIndex = max(maxFieldIndex, firstIndex, secondIndex);
}
538 
539 
/* Option handler for tests on two fields and a number. 'optionVal' must have the form
 * '<field1>:<field2>:<num>' with two different 1-based field numbers and a number. A
 * missing part and a part that fails conversion both produce the same error. On success
 * a FieldsPredicate for 'fn' is appended to 'tests' and 'maxFieldIndex' is raised to
 * cover both fields; otherwise an Exception with a user-facing message is thrown.
 */
void fieldFieldNumOptionHandler(
    ref FieldsPredicate[] tests, ref size_t maxFieldIndex, FieldFieldNumPredicate fn, string option, string optionVal)
{
    size_t firstField;
    size_t secondField;
    double value;
    bool parsedOk = false;

    /* First split separates field1 from the rest; second split separates field2 from num. */
    auto outerParts = optionVal.findSplit(":");
    if (outerParts[1].length != 0 && outerParts[2].length != 0)
    {
        auto innerParts = outerParts[2].findSplit(":");
        if (innerParts[1].length != 0 && innerParts[2].length != 0)
        {
            try
            {
                firstField = to!size_t(outerParts[0]);
                secondField = to!size_t(innerParts[0]);
                value = to!double(innerParts[2]);
                parsedOk = true;
            }
            catch (Exception exc)
            {
                parsedOk = false;
            }
        }
    }

    if (!parsedOk)
    {
        auto msg = format(
            "Invalid values in option: '--%s %s'. Expected: '--%s <field1>:<field2>:<num>' where fields are 1-upped integers.",
            option, optionVal, option);
        throw new Exception(msg);
    }

    if (firstField == 0 || secondField == 0)
    {
        auto msg = format("Invalid option: '--%s %s'. Zero is not a valid field index.", option, optionVal);
        throw new Exception(msg);
    }

    if (firstField == secondField)
    {
        auto msg = format("Invalid option: '--%s %s'. Field1 and field2 must be different fields", option, optionVal);
        throw new Exception(msg);
    }

    /* Field numbers are 1-based on the command line and 0-based internally. */
    immutable size_t firstIndex = firstField - 1;
    immutable size_t secondIndex = secondField - 1;
    tests ~= makeFieldFieldNumDelegate(fn, firstIndex, secondIndex, value);
    maxFieldIndex = max(maxFieldIndex, firstIndex, secondIndex);
}
591 
592 /** Command line options - This struct holds the results of command line option processing.
593  * It also has a method, processArgs, that invokes command line arg processing.
594  */
595 struct TsvFilterOptions
596 {
597     string programName;
598     FieldsPredicate[] tests;         // Derived from tests
599     size_t maxFieldIndex;            // Derived from tests
600     bool hasHeader = false;          // --H|header
601     bool invert = false;             // --invert
602     bool disjunct = false;           // --or
603     char delim = '\t';               // --delimiter
604     bool helpVerbose = false;        // --help-verbose
605     bool helpOptions = false;        // --help-options
606     bool versionWanted = false;      // --V|version
607 
608     /* Returns a tuple. First value is true if command line arguments were successfully
609      * processed and execution should continue, or false if an error occurred or the user
610      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
611      *
612      * Returning true (execution continues) means args have been validated and the
613      * tests array has been established.
614      */
615     auto processArgs (ref string[] cmdArgs)
616     {
617         import std.getopt;
618         import std.path : baseName, stripExtension;
619         import getopt_inorder;
620 
621         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
622 
623         /* Command option handlers - One handler for each option. These conform to the
624          * getopt required handler signature, and separate knowledge the specific command
625          * option text from the option processing.
626          */
627         void handlerFldEmpty(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldEmpty,    option, value); }
628         void handlerFldNotEmpty(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotEmpty, option, value); }
629         void handlerFldBlank(string option, string value)    { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldBlank,    option, value); }
630         void handlerFldNotBlank(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldNotBlank, option, value); }
631 
632         void handlerFldIsNumeric(string option, string value)  { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNumeric, option, value); }
633         void handlerFldIsFinite(string option, string value)   { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsFinite, option, value); }
634         void handlerFldIsNaN(string option, string value)      { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsNaN, option, value); }
635         void handlerFldIsInfinity(string option, string value) { fieldUnaryOptionHandler(tests, maxFieldIndex, &fldIsInfinity, option, value); }
636 
637         void handlerNumLE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLE, option, value); }
638         void handlerNumLT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numLT, option, value); }
639         void handlerNumGE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGE, option, value); }
640         void handlerNumGT(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numGT, option, value); }
641         void handlerNumEQ(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numEQ, option, value); }
642         void handlerNumNE(string option, string value) { fieldVsNumberOptionHandler(tests, maxFieldIndex, &numNE, option, value); }
643 
644         void handlerStrLE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLE, option, value); }
645         void handlerStrLT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strLT, option, value); }
646         void handlerStrGE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGE, option, value); }
647         void handlerStrGT(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strGT, option, value); }
648         void handlerStrEQ(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strEQ, option, value); }
649         void handlerStrNE(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNE, option, value); }
650 
651         void handlerStrInFld(string option, string value)    { fieldVsStringOptionHandler(tests, maxFieldIndex, &strInFld,    option, value); }
652         void handlerStrNotInFld(string option, string value) { fieldVsStringOptionHandler(tests, maxFieldIndex, &strNotInFld, option, value); }
653 
654         void handlerIStrEQ(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrEQ,       option, value); }
655         void handlerIStrNE(string option, string value)       { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNE,       option, value); }
656         void handlerIStrInFld(string option, string value)    { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrInFld,    option, value); }
657         void handlerIStrNotInFld(string option, string value) { fieldVsIStringOptionHandler(tests, maxFieldIndex, &istrNotInFld, option, value); }
658 
659         void handlerRegexMatch(string option, string value)     { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, true); }
660         void handlerRegexNotMatch(string option, string value)  { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, true); }
661         void handlerIRegexMatch(string option, string value)    { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexMatch,    option, value, false); }
662         void handlerIRegexNotMatch(string option, string value) { fieldVsRegexOptionHandler(tests, maxFieldIndex, &regexNotMatch, option, value, false); }
663 
664         void handlerFFLE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLE, option, value); }
665         void handlerFFLT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffLT, option, value); }
666         void handlerFFGE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGE, option, value); }
667         void handlerFFGT(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffGT, option, value); }
668         void handlerFFEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffEQ, option, value); }
669         void handlerFFNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffNE, option, value); }
670 
671         void handlerFFStrEQ(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrEQ,  option, value); }
672         void handlerFFStrNE(string option, string value)  { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffStrNE,  option, value); }
673         void handlerFFIStrEQ(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrEQ, option, value); }
674         void handlerFFIStrNE(string option, string value) { fieldVsFieldOptionHandler(tests, maxFieldIndex, &ffIStrNE, option, value); }
675 
676         void handlerFFAbsDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffLE, option, value); }
677         void handlerFFAbsDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffAbsDiffGT, option, value); }
678         void handlerFFRelDiffLE(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffLE, option, value); }
679         void handlerFFRelDiffGT(string option, string value) { fieldFieldNumOptionHandler(tests, maxFieldIndex, &ffRelDiffGT, option, value); }
680 
681         try
682         {
683             arraySep = ",";    // Use comma to separate values in command line options
684             auto r = getoptInorder(
685                 cmdArgs,
686                 "help-verbose",    "     Print full help.", &helpVerbose,
687                 "help-options",    "     Print the options list by itself.", &helpOptions,
688                  std.getopt.config.caseSensitive,
689                 "V|version",       "     Print version information and exit.", &versionWanted,
690                 "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
691                 std.getopt.config.caseInsensitive,
692                 "or",              "     Evaluate tests as an OR rather than an AND.", &disjunct,
693                 std.getopt.config.caseSensitive,
694                 "v|invert",        "     Invert the filter, printing lines that do not match.", &invert,
695                 std.getopt.config.caseInsensitive,
696                 "d|delimiter",     "CHR  Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &delim,
697 
698                 "empty",           "FIELD       True if field is empty.", &handlerFldEmpty,
699                 "not-empty",       "FIELD       True if field is not empty.", &handlerFldNotEmpty,
700                 "blank",           "FIELD       True if field is empty or all whitespace.", &handlerFldBlank,
701                 "not-blank",       "FIELD       True if field contains a non-whitespace character.", &handlerFldNotBlank,
702 
703                 "is-numeric",      "FIELD       True if field is interpretable as a number.", &handlerFldIsNumeric,
704                 "is-finite",       "FIELD       True if field is interpretable as a number and is not NaN or infinity.", &handlerFldIsFinite,
705                 "is-nan",          "FIELD       True if field is NaN.", &handlerFldIsNaN,
706                 "is-infinity",     "FIELD       True if field is infinity.", &handlerFldIsInfinity,
707 
708                 "le",              "FIELD:NUM   FIELD <= NUM (numeric).", &handlerNumLE,
709                 "lt",              "FIELD:NUM   FIELD <  NUM (numeric).", &handlerNumLT,
710                 "ge",              "FIELD:NUM   FIELD >= NUM (numeric).", &handlerNumGE,
711                 "gt",              "FIELD:NUM   FIELD >  NUM (numeric).", &handlerNumGT,
712                 "eq",              "FIELD:NUM   FIELD == NUM (numeric).", &handlerNumEQ,
713                 "ne",              "FIELD:NUM   FIELD != NUM (numeric).", &handlerNumNE,
714 
715                 "str-le",          "FIELD:STR   FIELD <= STR (string).", &handlerStrLE,
716                 "str-lt",          "FIELD:STR   FIELD <  STR (string).", &handlerStrLT,
717                 "str-ge",          "FIELD:STR   FIELD >= STR (string).", &handlerStrGE,
718                 "str-gt",          "FIELD:STR   FIELD >  STR (string).", &handlerStrGT,
719                 "str-eq",          "FIELD:STR   FIELD == STR (string).", &handlerStrEQ,
720                 "istr-eq",         "FIELD:STR   FIELD == STR (string, case-insensitive).", &handlerIStrEQ,
721                 "str-ne",          "FIELD:STR   FIELD != STR (string).", &handlerStrNE,
722                 "istr-ne",         "FIELD:STR   FIELD != STR (string, case-insensitive).", &handlerIStrNE,
723                 "str-in-fld",      "FIELD:STR   FIELD contains STR (substring search).", &handlerStrInFld,
724                 "istr-in-fld",     "FIELD:STR   FIELD contains STR (substring search, case-insensitive).", &handlerIStrInFld,
725                 "str-not-in-fld",  "FIELD:STR   FIELD does not contain STR (substring search).", &handlerStrNotInFld,
726                 "istr-not-in-fld", "FIELD:STR   FIELD does not contain STR (substring search, case-insensitive).", &handlerIStrNotInFld,
727 
728                 "regex",           "FIELD:REGEX   FIELD matches regular expression.", &handlerRegexMatch,
729                 "iregex",          "FIELD:REGEX   FIELD matches regular expression, case-insensitive.", &handlerIRegexMatch,
730                 "not-regex",       "FIELD:REGEX   FIELD does not match regular expression.", &handlerRegexNotMatch,
731                 "not-iregex",      "FIELD:REGEX   FIELD does not match regular expression, case-insensitive.", &handlerIRegexNotMatch,
732 
733                 "ff-le",           "FIELD1:FIELD2   FIELD1 <= FIELD2 (numeric).", &handlerFFLE,
734                 "ff-lt",           "FIELD1:FIELD2   FIELD1 <  FIELD2 (numeric).", &handlerFFLT,
735                 "ff-ge",           "FIELD1:FIELD2   FIELD1 >= FIELD2 (numeric).", &handlerFFGE,
736                 "ff-gt",           "FIELD1:FIELD2   FIELD1 >  FIELD2 (numeric).", &handlerFFGT,
737                 "ff-eq",           "FIELD1:FIELD2   FIELD1 == FIELD2 (numeric).", &handlerFFEQ,
738                 "ff-ne",           "FIELD1:FIELD2   FIELD1 != FIELD2 (numeric).", &handlerFFNE,
739                 "ff-str-eq",       "FIELD1:FIELD2   FIELD1 == FIELD2 (string).", &handlerFFStrEQ,
740                 "ff-istr-eq",      "FIELD1:FIELD2   FIELD1 == FIELD2 (string, case-insensitive).", &handlerFFIStrEQ,
741                 "ff-str-ne",       "FIELD1:FIELD2   FIELD1 != FIELD2 (string).", &handlerFFStrNE,
742                 "ff-istr-ne",      "FIELD1:FIELD2   FIELD1 != FIELD2 (string, case-insensitive).", &handlerFFIStrNE,
743 
744                 "ff-absdiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) <= NUM", &handlerFFAbsDiffLE,
745                 "ff-absdiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2)  > NUM", &handlerFFAbsDiffGT,
746                 "ff-reldiff-le",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2)) <= NUM", &handlerFFRelDiffLE,
747                 "ff-reldiff-gt",   "FIELD1:FIELD2:NUM   abs(FIELD1 - FIELD2) / min(abs(FIELD1), abs(FIELD2))  > NUM", &handlerFFRelDiffGT,
748                 );
749 
750             /* Both help texts are a bit long. In this case, for "regular" help, don't
751              * print options, just the text. The text summarizes the options.
752              */
753             if (r.helpWanted)
754             {
755                 stdout.write(helpText);
756                 return tuple(false, 0);
757             }
758             else if (helpVerbose)
759             {
760                 defaultGetoptPrinter(helpTextVerbose, r.options);
761                 return tuple(false, 0);
762             }
763             else if (helpOptions)
764             {
765                 defaultGetoptPrinter(helpTextOptions, r.options);
766                 return tuple(false, 0);
767             }
768             else if (versionWanted)
769             {
770                 import tsvutils_version;
771                 writeln(tsvutilsVersionNotice("tsv-filter"));
772                 return tuple(false, 0);
773             }
774         }
775         catch (Exception exc)
776         {
777             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
778             return tuple(false, 1);
779         }
780         return tuple(true, 0);
781     }
782 }
783 
/** Reads the input files line by line, runs the configured tests against the fields
 * of each line, and writes the lines that pass to standard output.
 *
 * Params:
 *   cmdopt     = Command line options. Supplies the tests, the field delimiter, the
 *                highest field index needed, and the header, OR-mode and invert flags.
 *   inputFiles = Files to read. Standard input is read when the list is empty, and
 *                for any entry equal to "-".
 *
 * Throws: Exception when a line has fewer fields than the tests need, or when
 *         evaluating or writing a line throws. The first line of each file is also
 *         passed to throwIfWindowsNewlineOnUnix.
 */
void tsvFilter(in TsvFilterOptions cmdopt, in string[] inputFiles)
{
    import std.algorithm : all, any, splitter;
    import std.range;
    import tsvutil : BufferedOutputRange, throwIfWindowsNewlineOnUnix;

    /* Output goes through a BufferedOutputRange, which improves performance on narrow
     * files with a high percentage of writes. Output should still be responsive when
     * matches are rare, so the counter starts above the threshold (forcing a flush on
     * the first matched line) and a flush is forced whenever a match follows a long
     * stretch of input lines without one.
     */
    enum flushThreshold = 1024;
    size_t linesSinceFlush = flushThreshold + 1;

    auto output = BufferedOutputRange!(typeof(stdout))(stdout);

    /* One slot per field index the tests may reference; reused for every line. */
    auto fields = new char[][](cmdopt.maxFieldIndex + 1);
    bool headerEmitted = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool fromStdin = (filename == "-");
        auto displayName = fromStdin ? "Standard Input" : filename;
        auto inputStream = fromStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            immutable bool isFirstLine = (lineNum == 1);

            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Header line: written for the first file only, never tested. */
                if (!headerEmitted)
                {
                    output.appendln(line);
                    headerEmitted = true;
                }
                continue;
            }

            /* Fill the fields array, stopping once every needed slot is filled. */
            size_t numFilled = 0;
            foreach (fieldValue; line.splitter(cmdopt.delim))
            {
                if (numFilled == fields.length) break;
                fields[numFilled] = fieldValue;
                numFilled++;
            }

            if (numFilled == 0)
            {
                assert(line.length == 0);
                /* Bug work-around. Currently empty lines are not handled properly by splitter.
                 *   Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                 *   Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                 * Work-around: Use the line itself (an empty string) as the first field.
                 */
                fields[0] = line;
                numFilled = 1;
            }

            if (numFilled < fields.length)
            {
                throw new Exception(
                    format("Not enough fields in line. File: %s, Line: %s",
                           displayName, lineNum));
            }

            /* Evaluate the tests. A test throws if a field cannot be converted to the
             * type it expects; that is re-thrown with file and line information.
             */
            try
            {
                linesSinceFlush++;

                bool passed;
                if (cmdopt.disjunct) passed = cmdopt.tests.any!(x => x(fields));
                else passed = cmdopt.tests.all!(x => x(fields));

                if (cmdopt.invert) passed = !passed;

                if (passed)
                {
                    immutable bool flushedByAppend = output.appendln(line);
                    immutable bool flushOverdue = linesSinceFlush > flushThreshold;

                    if (!flushedByAppend && flushOverdue) output.flush;
                    if (flushedByAppend || flushOverdue) linesSinceFlush = 0;
                }
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, displayName, lineNum,
                           isFirstLine ? "\n  Is this a header line? Use --header to skip." : ""));
            }
        }
    }
}