tsv_summarize source code

1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple;
19 import std.container : DList;
20 
version(unittest)
{
    // Unit test builds get their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line into a TsvSummarizeOptions, then hands the options
     * and the arguments following the program name to tsvSummarize.
     *
     * Returns: The exit code reported by option processing when it asks to stop,
     * 1 if tsvSummarize throws an Exception, otherwise 0.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when built with DMD code coverage enabled. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto argsStatus = cmdopt.processArgs(cmdArgs);
        immutable bool shouldContinue = argsStatus[0];
        immutable int earlyExitCode = argsStatus[1];
        if (!shouldContinue) return earlyExitCode;

        /* Reset LDC profile counters so option processing is not part of the profile. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            tsvSummarize(cmdopt, cmdArgs[1..$]);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
53 
/* Text printed ahead of the option list for '--help-verbose'. Option and operator
 * names used here must match the names registered with getopt in
 * TsvSummarizeOptions.processArgs.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make is generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min time_mean
   ford   122      127
   chevy  124      124
   bmw    118      122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25                // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Numeric values are printed to 12 significant digits by default. This can be
changed using the '--p|float-precision' option. If six or less it sets the
number of significant digits after the decimal point. If greater than six it
sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

/* Text printed ahead of the option list for the standard '--help' output. */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";
141 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    string programName;                // Derived from cmdArgs[0] by processArgs.

    /* Options set directly on the command line. */
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.

    /* Derived value. Handed to every operator created by the option handlers and
     * updated in derivations() once all options are known.
     * NOTE(review): a class instance created in a struct field initializer is built
     * once and shared by all default-initialized instances of the struct - confirm
     * that sharing is acceptable if more than one TsvSummarizeOptions is ever created.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import getopt_inorder;
        import tsvutil :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key.", &countOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &operatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &operatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            /* Help and version requests print their text and stop with exit code 0. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value has the form '<field-list>' or '<field>:<header>'. One operator
     * is appended to 'operators' per field, and endFieldIndex is raised to cover every
     * field used. Throws an Exception, with the message prefixed by the option name, if
     * the value is malformed or a custom header is not permitted.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsvutil :  parseFieldList;

        /* valSplit[0]: field list; valSplit[1]: the ':' if present; valSplit[2]: header. */
        auto valSplit = findSplit(optionVal, ":");

        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        try
        {
            /* fieldNum counts fields from 1; fieldIndex is the zero-based field index. */
            foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (!valSplit[2].empty) // Header specified
                {
                    if (fieldNum > 1)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                                   option, optionVal));
                    }
                    else if (!op.allowCustomHeader)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                                   option, optionVal));
                    }

                    op.setCustomHeader(valSplit[2].to!string);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            /* Tag the message with the option being processed, then re-throw. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value has the form '<field-list>:<prob>[,<prob>]' or
     * '<field>:<prob>:<header>'. One QuantileOperator is appended to 'operators' for
     * every (field, probability) pair. Throws an Exception if the value is malformed,
     * a probability is outside [0.0,1.0], or a custom header is combined with multiple
     * fields or probabilities.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsvutil :  parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try
        {
            foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
            {
                fieldIndices ~= fieldIndex;
            }
        }
        catch (Exception exc)
        {
            /* Tag the message with the option being processed, then re-throw. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated range test so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator with its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator using STR as header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            /* The message names the options as registered with getopt. */
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        foreach (keyIndex; keyFields)
        {
            if (keyIndex >= endFieldIndex) endFieldIndex = keyIndex + 1;
        }

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}
433 
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * It builds a Summarizer suited to the number of key fields, feeds it every line of
 * every input file (standard input when no files are given or when a file is named
 * "-"), then writes the summary header (if requested) and body to standard output.
 *
 * Throws: Exception if a line has fewer fields than the options require, or if the
 * summarizer fails to process a line.
 */
void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles)
{
    import tsvutil : throwIfWindowsNewlineOnUnix;

    alias OutputWriter = typeof(stdout.lockingTextWriter());

    /* Pick the Summarizer based on the number of key-fields entered. */
    immutable numKeyFields = cmdopt.keyFields.length;
    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!OutputWriter(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!OutputWriter(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!OutputWriter(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the operators over to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Reusable per-line buffer holding the first endFieldIndex fields of the line. */
    auto lineFields = new char[][](cmdopt.endFieldIndex);
    bool headerFound = false;

    /* Read each input file one line at a time. */
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool readingStdin = (filename == "-");
        auto displayName = readingStdin ? "Standard Input" : filename;
        auto inputStream = readingStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            /* Copy the needed number of fields to the fields array.
             * Note: The number is zero if no operator needs fields. Notably, the count
             * operator. Used by itself, it counts the number input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t numFieldsCopied = 0;
                foreach (fieldValue;
                         line.splitter(cmdopt.inputFieldDelimiter).take(cmdopt.endFieldIndex))
                {
                    lineFields[numFieldsCopied] = fieldValue;
                    numFieldsCopied++;
                }

                if (numFieldsCopied == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[0] = line;
                    numFieldsCopied = 1;
                }

                if (numFieldsCopied < cmdopt.endFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               displayName, lineNum));
                }
            }

            /* The first line of every file is a header line when --header is on, but
             * only the first one seen is passed to the summarizer.
             */
            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
                continue;
            }

            /* Process the line. Processing will fail (throw) if a field cannot be
             * converted to the expected type.
             */
            try
            {
                summarizer.processNextLine(lineFields);
            }
            catch (Exception exc)
            {
                auto headerHint =
                    (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : "";
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, displayName, lineNum, headerHint));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    immutable bool wantHeader = cmdopt.hasHeader || cmdopt.writeHeader;
    if (wantHeader) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}
544 
/* Builds the default header for a field. It is used when the output needs field
 * headers but the input does not supply them. The result is "fieldN", with N being
 * the one-based field number (the zero-based fieldIndex plus one).
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t oneBasedFieldNum = fieldIndex + 1;
    return format("field%d", oneBasedFieldNum);
}

unittest
{
    static immutable size_t[] fieldIndices = [0, 10];
    static immutable string[] expectedHeaders = ["field1", "field11"];

    foreach (i, fieldIndex; fieldIndices)
    {
        assert(fieldHeaderFromIndex(fieldIndex) == expectedHeaders[i]);
    }
}
560 
/* Derives a summary header from a field header, giving "<fieldHeader>_<operation>".
 * For example, field header "length" with operation "max" yields "length_max". The
 * field header typically comes from a header line in the input data or was built by
 * fieldHeaderFromIndex().
 *
 * An empty operationName leaves fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0) return fieldHeader;

    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    immutable withOperation = summaryHeaderFromFieldHeader("originalfield", "mycalc");
    assert(withOperation == "originalfield_mycalc");

    immutable withoutOperation = summaryHeaderFromFieldHeader("originalfield", "");
    assert(withoutOperation == "originalfield");
}
579 
/* Printing options used by Summarizers and Calculators. The values normally come
 * from command line options; they live in their own struct for modularity.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;          // Separator written between output fields.
    char valuesDelimiter;         // Separator written between values within a field.
    size_t floatPrecision = 12;   // Precision passed along when formatting numbers.

    /* Formats an integral or floating point number by forwarding it, together with
     * floatPrecision, to tsv_numerics.formatNumber.
     */
    auto formatNumber(T)(T n) const
        if (isIntegral!T || isFloatingPoint!T)
    {
        static import tsv_numerics;
        return tsv_numerics.formatNumber!T(n, floatPrecision);
    }
}
597 
/* A Summarizer maintains the state of the summarization and performs basic processing.
 * Handling of files and input lines is left to the caller.
 * API:
 * - setOperators - Called after initializing the object to supply the operators to be
 *   processed.
 * - processHeaderLine - Called to process the header line of each file. Returns true if
 *   it was the first header line processed (used when reading multiple files).
 * - processNextLine - Called to process non-header lines.
 * - writeSummaryHeader - Called to write the header line.
 * - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    // Supplies the operators the summarizer will run.
    void setOperators(InputRange!Operator op);
    // Processes a header line, given as its already-split fields.
    bool processHeaderLine(const char[][] lineFields);
    // Processes a data (non-header) line, given as its already-split fields.
    void processNextLine(const char[][] lineFields);
    // Writes the summary header line to outputStream.
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    // Writes the summary result lines to outputStream.
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
616 
/* SummarizerBase performs the work shared by all summarizers, most everything except
 * for handling of unique keys. The base class handles creation, allocates storage for
 * Operators and SharedFieldValues, and similar. Derived classes deal primarily with
 * unique keys and the associated Calculators and UniqueKeyValuesLists.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /* The field delimiter given at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /* Sets the Operators used by the Summarizer. Called after construction.
     *
     * Every operator is stored and counted. For operators that ask for field values
     * to be saved, the requested field indices are registered with the shared field
     * values object, which is created the first time it is needed.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            _numOperators++;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register when the operator saves no field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (index; numericIndices) _sharedFieldValues.addNumericIndex(index);
            foreach (index; textIndices) _sharedFieldValues.addTextIndex(index);
        }
    }

    /* Passes the header fields to every operator, but only on the first call.
     * Returns true on that first call and false on every later one.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators[]) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /* Creates a UniqueKeyValuesLists from the shared field values, or returns null
     * when no operator asked for field values to be saved.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;

        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
689 
/* The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;          // One per operator, in operator order.
    private UniqueKeyValuesLists _valueLists;   // Null if no operator saves field values.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /* Registers the operators with the base class and creates one Calculator per
     * newly added Operator.
     *
     * The calculators are built from the operators stored by the base class rather
     * than by walking the 'operators' argument a second time. An input range is only
     * guaranteed to support a single pass, so a second pass over the argument could
     * come up empty after super.setOperators has traversed it.
     */
    override void setOperators(InputRange!Operator operators)
    {
        /* Operators stored before this call already have their calculators. */
        immutable size_t numPreviouslyStored = _numOperators;

        super.setOperators(operators);

        foreach (op; _operators[].drop(numPreviouslyStored)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /* Feeds a data line to every calculator and, when field values are being saved,
     * to the saved-values lists.
     */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /* Writes one line holding each operator's header, separated by the field delimiter. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /* Writes one line holding each calculator's result, separated by the field delimiter. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
734 
735 /* KeySummarizerBase does work shared by the single key and multi-key summarizers. The
736  * primary difference between those two is the formation of the key. The primary reason
737  * for separating those into two separate classes is to simplify (speed-up) handling of
738  * single field keys, which are the most common use case.
739  */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /* Per-key state: one calculator per operator plus the saved field values
     * (null when no operator saves values).
     */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    /* Keys in first-seen order; this is the order used when writing the summary. */
    private DList!string _uniqueKeys;
    private UniqueKeyData[string] _uniqueKeyData;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /* Routes a line to the calculators and value lists belonging to 'key', creating
     * the per-key state the first time the key is seen.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData entry;
        if (auto existing = key in _uniqueKeyData)
        {
            entry = *existing;
        }
        else
        {
            /* The key is copied to a string only when it is stored as a new key. */
            entry = addUniqueKey(key.to!string);
        }

        foreach (calc; entry.calculators) calc.processNextLine(lineFields);
        if (entry.valuesLists !is null)
        {
            entry.valuesLists.processNextLine(lineFields, _missingPolicy);
        }
    }

    /* Registers a key not seen before and returns its newly created state. */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto perKeyCalculators = new Calculator[_numOperators];
        foreach (slot, op; _operators[].enumerate)
        {
            perKeyCalculators[slot] = op.makeCalculator;
        }

        auto entry = UniqueKeyData(perKeyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = entry;
        return entry;
    }

    /* Writes the key header followed by the operator headers, then a newline. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto operatorHeaders = _operators[].map!(op => op.header).join(printOptions.fieldDelimiter);

        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    /* Writes one line per unique key, in first-seen order: the key, then each
     * calculator's result.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (uniqueKey; _uniqueKeys)
        {
            auto entry = _uniqueKeyData[uniqueKey];
            auto results =
                entry.calculators[]
                .map!(calc => calc.calculate(entry.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter);

            put(outputStream, uniqueKey);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream, results);
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}
809 
810 /* This Summarizer is for the case where the unique key is based on exactly one field.
811  */
class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    /* Zero-based index of the field used as the unique key. */
    private size_t _keyFieldIndex = 0;
    /* Header text for the key column; generated from the index until a header line
     * is processed.
     */
    private string _keyFieldHeader;

    /* Note: the list of unique keys is maintained by KeySummarizerBase. This class
     * deliberately declares no list of its own.
     */

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /* Processes a header line. For the first header line, the key column header is
     * taken from the key field of that line. Returns true if this was the first
     * header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: lineFields[_keyFieldIndex] is read below. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /* Uses the key field's value as the unique key for this line. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
848 
849 /* This Summarizer is for the case where the unique key is based on multiple fields.
850  */
class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    /* Zero-based indices of the fields forming the key, in key order. */
    private size_t[] _keyFieldIndices;
    /* Header text for the key columns; generated from the indices until a header
     * line is processed.
     */
    private string _keyFieldHeader;

    /* Note: the list of unique keys is maintained by KeySummarizerBase. This class
     * deliberately declares no list of its own.
     */

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /* Processes a header line. For the first header line, the key header becomes the
     * key fields of that line joined by the input field delimiter. Returns true if
     * this was the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        }
        return isFirstHeaderLine;
    }

    /* Forms the key by joining the key fields with the input field delimiter and
     * hands the line to the shared key-processing logic.
     */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        /* The joined key is passed as-is. processNextLineWithKey converts it to a
         * string only when the key is new, so no extra copy is made for lines whose
         * key has already been seen.
         */
        auto key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter);
        processNextLineWithKey(key, lineFields);
    }
}
893 
version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and the expected output. The
     * input file and expected output are already split into lines and fields; the helper
     * manages re-assembly. The program name from the command line args (cmdArgs[0]) is
     * printed if an error occurs, which is useful for identifying the test that failed.
     *
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of file
     * input/output would enable running unit tests directly on top of tsvSummarize.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the test name (cmdArgs[0]). 'msg' is
         * used as a format string, so text that is not a format string must be passed
         * as an argument to a "%s" format rather than as 'msg' itself.
         */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        /* Captured before processArgs runs so the full argument list can be reported;
         * presumably option processing consumes entries from cmdArgs.
         */
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    /* The exception text is passed as a format argument, never as the
                     * format string: a '%' in exc.msg would otherwise make format()
                     * throw and mask the original failure.
                     */
                    assert(false, formatAssertMessage("%s", exc.msg));
                }
            }
        }
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output; non-empty output always ends in a newline. */
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}
989 
990 unittest
991 {
992     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
993      * extent, command line option handling (TsvSummarizeOptions). Individual operators
994      * have separate tests, those tests test the no-key summarizer. The Values operator is
995      * used in these tests. It engages a number of behaviors, and the results have limited
996      * ambiguity. Using only one operator limits dependence on individual operators.
997      */
998 
999     auto file1 = [["fld1", "fld2", "fld3"],
1000                   ["a", "a",  "3"],
1001                   ["c", "a",  "2b"],
1002                   ["c", "bc", ""],
1003                   ["a", "c",  "2b"],
1004                   ["",  "bc", ""],
1005                   ["c", "bc", "3"]];
1006 
1007     /* Single-key summarizer tests.
1008      */
1009     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"],
1010                    file1,
1011                    [["fld1", "fld1_values"],
1012                     ["a", "a|a"],
1013                     ["c", "c|c|c"],
1014                     ["",  ""]]
1015         );
1016     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"],
1017                    file1,
1018                    [["fld1", "fld2_values"],
1019                     ["a", "a|c"],
1020                     ["c", "a|bc|bc"],
1021                     ["",  "bc"]]
1022         );
1023     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"],
1024                    file1,
1025                    [["fld1", "fld3_values"],
1026                     ["a", "3|2b"],
1027                     ["c", "2b||3"],
1028                     ["",  ""]]
1029         );
1030     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"],
1031                    file1,
1032                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1033                     ["a", "a|a",   "a|c",     "3|2b"],
1034                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1035                     ["",  "",      "bc",      ""]]
1036         );
1037     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"],
1038                    file1,
1039                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1040                     ["a", "a|a",   "a|c",     "3|2b"],
1041                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1042                     ["",  "",      "bc",      ""]]
1043         );
1044     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"],
1045                    file1,
1046                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1047                     ["a", "3|2b",  "a|c",     "a|a"],
1048                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1049                     ["",  "",      "bc",      ""]]
1050         );
1051     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"],
1052                    file1,
1053                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1054                     ["a", "3|2b",  "a|c",     "a|a"],
1055                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1056                     ["",  "",      "bc",      ""]]
1057         );
1058     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"],
1059                    file1,
1060                    [["fld2", "fld1_values"],
1061                     ["a",  "a|c"],
1062                     ["bc", "c||c"],
1063                     ["c",  "a"]]
1064         );
1065     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"],
1066                    file1,
1067                    [["fld2", "fld2_values"],
1068                     ["a",  "a|a"],
1069                     ["bc", "bc|bc|bc"],
1070                     ["c",  "c"]]
1071         );
1072     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"],
1073                    file1,
1074                    [["fld2", "fld3_values"],
1075                     ["a",  "3|2b"],
1076                     ["bc", "||3"],
1077                     ["c",  "2b"]]
1078         );
1079     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"],
1080                    file1,
1081                    [["fld2", "fld1_values", "fld3_values"],
1082                     ["a",  "a|c",  "3|2b"],
1083                     ["bc", "c||c", "||3"],
1084                     ["c",  "a",    "2b"]]
1085         );
1086     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"],
1087                    file1,
1088                    [["fld2", "fld3_values", "fld1_values"],
1089                     ["a",  "3|2b", "a|c"],
1090                     ["bc", "||3",  "c||c"],
1091                     ["c",  "2b",   "a"]]
1092         );
1093     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"],
1094                    file1,
1095                    [["fld3", "fld1_values"],
1096                     ["3",  "a|c"],
1097                     ["2b", "c|a"],
1098                     ["",   "c|"]]
1099         );
1100     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"],
1101                    file1,
1102                    [["fld3", "fld2_values"],
1103                     ["3",  "a|bc"],
1104                     ["2b", "a|c"],
1105                     ["",   "bc|bc"]]
1106         );
1107     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"],
1108                    file1,
1109                    [["fld3", "fld1_values", "fld2_values"],
1110                     ["3",  "a|c", "a|bc"],
1111                     ["2b", "c|a", "a|c"],
1112                     ["",   "c|",  "bc|bc"]]
1113         );
1114 
1115     /* Multi-key summarizer tests.
1116      */
1117     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"],
1118                    file1,
1119                    [["fld1", "fld2", "fld1_values"],
1120                     ["a", "a",  "a"],
1121                     ["c", "a",  "c"],
1122                     ["c", "bc", "c|c"],
1123                     ["a", "c",  "a"],
1124                     ["", "bc",  ""]]
1125         );
1126     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"],
1127                    file1,
1128                    [["fld1", "fld2", "fld2_values"],
1129                     ["a", "a",  "a"],
1130                     ["c", "a",  "a"],
1131                     ["c", "bc", "bc|bc"],
1132                     ["a", "c",  "c"],
1133                     ["", "bc",  "bc"]]
1134         );
1135     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"],
1136                    file1,
1137                    [["fld1", "fld2", "fld3_values"],
1138                     ["a", "a",  "3"],
1139                     ["c", "a",  "2b"],
1140                     ["c", "bc", "|3"],
1141                     ["a", "c",  "2b"],
1142                     ["", "bc",  ""]]
1143         );
1144     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"],
1145                    file1,
1146                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1147                     ["a", "a",  "3", "a"],
1148                     ["c", "a",  "2b", "c"],
1149                     ["c", "bc", "|3", "c|c"],
1150                     ["a", "c",  "2b", "a"],
1151                     ["",  "bc", "",   ""]]
1152         );
1153     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"],
1154                    file1,
1155                    [["fld3", "fld2", "fld1_values"],
1156                     ["3",  "a",  "a"],
1157                     ["2b", "a",  "c"],
1158                     ["",   "bc", "c|"],
1159                     ["2b", "c",  "a"],
1160                     ["3",  "bc", "c"]]
1161         );
1162     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"],
1163                    file1,
1164                    [["fld3", "fld2", "fld1_values"],
1165                     ["3",  "a",  "a"],
1166                     ["2b", "a",  "c"],
1167                     ["",   "bc", "c|"],
1168                     ["2b", "c",  "a"],
1169                     ["3",  "bc", "c"]]
1170         );
1171     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"],
1172                    file1,
1173                    [["fld2", "fld1", "fld3", "fld2_values"],
1174                     ["a",  "a", "3",  "a"],
1175                     ["a",  "c", "2b", "a"],
1176                     ["bc", "c", "",   "bc"],
1177                     ["c",  "a", "2b", "c"],
1178                     ["bc", "",  "",   "bc"],
1179                     ["bc", "c", "3",  "bc"]]
1180         );
1181 
1182     /* Missing policies. */
1183     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"],
1184                    file1,
1185                    [["fld1", "fld1_values"],
1186                     ["a", "a|a"],
1187                     ["c", "c|c|c"],
1188                     ["",  ""]]
1189         );
1190     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"],
1191                    file1,
1192                    [["fld1", "fld2_values"],
1193                     ["a", "a|c"],
1194                     ["c", "a|bc|bc"],
1195                     ["",  "bc"]]
1196         );
1197     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"],
1198                    file1,
1199                    [["fld1", "fld3_values"],
1200                     ["a", "3|2b"],
1201                     ["c", "2b|3"],
1202                     ["",  ""]]
1203         );
1204     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"],
1205                    file1,
1206                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1207                     ["a", "a|a",   "a|c",     "3|2b"],
1208                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1209                     ["",  "",      "bc",      ""]]
1210         );
1211     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"],
1212                    file1,
1213                    [["fld1", "fld1_values"],
1214                     ["a", "a|a"],
1215                     ["c", "c|c|c"],
1216                     ["",  "NA"]]
1217         );
1218     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"],
1219                    file1,
1220                    [["fld1", "fld2_values"],
1221                     ["a", "a|c"],
1222                     ["c", "a|bc|bc"],
1223                     ["",  "bc"]]
1224         );
1225     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"],
1226                    file1,
1227                    [["fld1", "fld3_values"],
1228                     ["a", "3|2b"],
1229                     ["c", "2b|NA|3"],
1230                     ["",  "NA"]]
1231         );
1232     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"],
1233                    file1,
1234                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1235                     ["a", "a|a",   "a|c",     "3|2b"],
1236                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1237                     ["",  "NA",      "bc",      "NA"]]
1238         );
1239     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"],
1240                    file1,
1241                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1242                     ["a", "a",  "3", "a"],
1243                     ["c", "a",  "2b", "c"],
1244                     ["c", "bc", "3", "c|c"],
1245                     ["a", "c",  "2b", "a"],
1246                     ["",  "bc", "",   ""]]
1247         );
1248     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"],
1249                    file1,
1250                    [["fld3", "fld2", "fld1_values"],
1251                     ["3",  "a",  "a"],
1252                     ["2b", "a",  "c"],
1253                     ["",   "bc", "c"],
1254                     ["2b", "c",  "a"],
1255                     ["3",  "bc", "c"]]
1256         );
1257     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"],
1258                    file1,
1259                    [["fld2", "fld1", "fld3", "fld2_values"],
1260                     ["a",  "a", "3",  "a"],
1261                     ["a",  "c", "2b", "a"],
1262                     ["bc", "c", "",   "bc"],
1263                     ["c",  "a", "2b", "c"],
1264                     ["bc", "",  "",   "bc"],
1265                     ["bc", "c", "3",  "bc"]]
1266         );
1267     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"],
1268                    file1,
1269                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1270                     ["a", "a",  "3", "a"],
1271                     ["c", "a",  "2b", "c"],
1272                     ["c", "bc", "NA|3", "c|c"],
1273                     ["a", "c",  "2b", "a"],
1274                     ["",  "bc", "NA",   "NA"]]
1275         );
1276     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"],
1277                    file1,
1278                    [["fld3", "fld2", "fld1_values"],
1279                     ["3",  "a",  "a"],
1280                     ["2b", "a",  "c"],
1281                     ["",   "bc", "c|NA"],
1282                     ["2b", "c",  "a"],
1283                     ["3",  "bc", "c"]]
1284         );
1285     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"],
1286                    file1,
1287                    [["fld2", "fld1", "fld3", "fld2_values"],
1288                     ["a",  "a", "3",  "a"],
1289                     ["a",  "c", "2b", "a"],
1290                     ["bc", "c", "",   "bc"],
1291                     ["c",  "a", "2b", "c"],
1292                     ["bc", "",  "",   "bc"],
1293                     ["bc", "c", "3",  "bc"]]
1294         );
1295 
1296     /* Validate that the no-key summarizer works with testSummarizer helper function.
1297      */
1298     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"],
1299                    file1,
1300                    [["fld1_values", "fld2_values"],
1301                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1302         );
1303 
1304     /* Header variations: no header line; auto-generated header line; custom headers.
1305      */
1306     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"],
1307                    file1[1..$],
1308                    [["a", "a|a"],
1309                     ["c", "c|c|c"],
1310                     ["",  ""]]
1311         );
1312     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"],
1313                    file1[1..$],
1314                    [["a", "a",  "a"],
1315                     ["c", "a",  "a"],
1316                     ["c", "bc", "bc|bc"],
1317                     ["a", "c",  "c"],
1318                     ["", "bc",  "bc"]]
1319         );
1320     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"],
1321                    file1[1..$],
1322                    [["field2", "field1_values"],
1323                     ["a",  "a|c"],
1324                     ["bc", "c||c"],
1325                     ["c",  "a"]]
1326         );
1327     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"],
1328                    file1[1..$],
1329                    [["field3", "field2", "field1_values"],
1330                     ["3",  "a",  "a"],
1331                     ["2b", "a",  "c"],
1332                     ["",   "bc", "c|"],
1333                     ["2b", "c",  "a"],
1334                     ["3",  "bc", "c"]]
1335         );
1336     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"],
1337                    file1,
1338                    [["fld2", "Field3Values"],
1339                     ["a",  "3|2b"],
1340                     ["bc", "||3"],
1341                     ["c",  "2b"]]
1342         );
1343     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"],
1344                    file1,
1345                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1346                     ["a", "a",  "3", "a"],
1347                     ["c", "a",  "2b", "c"],
1348                     ["c", "bc", "|3", "c|c"],
1349                     ["a", "c",  "2b", "a"],
1350                     ["",  "bc", "",   ""]]
1351         );
1352     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"],
1353                    file1[1..$],
1354                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1355                     ["a", "3|2b",  "a|c",     "a|a"],
1356                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1357                     ["",  "",      "bc",      ""]]
1358         );
1359     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1360                    file1[1..$],
1361                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1362                     ["a", "3",  "a",  "3",  "a", "a"],
1363                     ["c", "2b", "a",  "2b", "c", "a"],
1364                     ["c", "",   "bc", "",   "c", "bc"],
1365                     ["a", "2b", "c",  "2b", "a", "c"],
1366                     ["",  "",   "bc", "",   "",  "bc"],
1367                     ["c", "3",  "bc", "3",  "c", "bc"]]
1368         );
1369     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1370                    file1[1..$],
1371                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1372                     ["a", "3",  "a",  "3",  "a", "a"],
1373                     ["c", "2b", "a",  "2b", "c", "a"],
1374                     ["c", "",   "bc", "",   "c", "bc"],
1375                     ["a", "2b", "c",  "2b", "a", "c"],
1376                     ["",  "",   "bc", "",   "",  "bc"],
1377                     ["c", "3",  "bc", "3",  "c", "bc"]]
1378         );
1379 
1380     /* Alternate file widths and lengths.
1381      */
1382 
1383     auto file3x2 = [["fld1", "fld2", "fld3"],
1384                     ["a", "b", "c"],
1385                     ["c", "b", "a"]];
1386 
1387     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"],
1388                    file3x2,
1389                    [["fld1", "fld3_values"],
1390                     ["a", "c"],
1391                     ["c", "a"]]
1392         );
1393     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"],
1394                    file3x2,
1395                    [["fld2", "fld3_values"],
1396                     ["b", "c|a"]]
1397         );
1398     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"],
1399                    file3x2,
1400                    [["fld2", "fld1", "fld3_values"],
1401                     ["b", "a", "c"],
1402                     ["b", "c", "a"]]
1403         );
1404 
1405     auto file3x1 = [["fld1", "fld2", "fld3"],
1406                     ["a", "b", "c"]];
1407 
1408     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"],
1409                    file3x1,
1410                    [["fld1", "fld3_values"],
1411                     ["a", "c"]]
1412         );
1413     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"],
1414                    file3x1[1..$],
1415                    [["a", "c"]]
1416         );
1417     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"],
1418                    file3x1,
1419                    [["fld2", "fld1", "fld3_values"],
1420                     ["b", "a", "c"]]
1421         );
1422     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"],
1423                    file3x1[1..$],
1424                    [["b", "a", "c"]]
1425         );
1426 
1427     auto file3x0 = [["fld1", "fld2", "fld3"]];
1428 
1429     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"],
1430                    file3x0,
1431                    [["fld1", "fld3_values"]]
1432         );
1433     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"],
1434                    file3x0[1..$],
1435                    []
1436         );
1437     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"],
1438                    file3x0[1..$],
1439                    [["field1", "field3_values"]]
1440         );
1441 
1442 
1443     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"],
1444                    file3x0,
1445                    [["fld2", "fld1", "fld3_values"]]
1446         );
1447 
1448     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"],
1449                    file3x0[1..$],
1450                    []
1451         );
1452 
1453     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"],
1454                    file3x0[1..$],
1455                    [["field2", "field1", "field3_values"]]
1456         );
1457 
1458     auto file2x1 = [["fld1", "fld2"],
1459                     ["a", "b"]];
1460 
1461     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"],
1462                    file2x1,
1463                    [["fld1", "fld2_values"],
1464                     ["a", "b"]]
1465         );
1466     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"],
1467                    file2x1,
1468                    [["fld2", "fld1", "fld1_values"],
1469                     ["b", "a", "a"]]
1470         );
1471 
1472     auto file2x0 = [["fld1", "fld2"]];
1473 
1474     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"],
1475                    file2x0,
1476                    [["fld1", "fld2_values"]]
1477         );
1478     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"],
1479                    file2x0,
1480                    [["fld2", "fld1", "fld1_values"]]
1481         );
1482 
1483     auto file1x2 = [["fld1"],
1484                     ["a"],
1485                     [""]];
1486 
1487     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"],
1488                    file1x2,
1489                    [["fld1", "fld1_values"],
1490                     ["a", "a"],
1491                     ["",  ""]]
1492         );
1493 
1494     auto file1x2b = [["fld1"],
1495                      [""],
1496                      [""]];
1497 
1498     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"],
1499                    file1x2b,
1500                    [["fld1", "fld1_values"],
1501                     ["", "|"]]
1502         );
1503 
1504     auto file1x1 = [["fld1"],
1505                     ["x"]];
1506 
1507     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"],
1508                    file1x1,
1509                    [["fld1", "fld1_values"],
1510                     ["x", "x"]]
1511         );
1512 
1513     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"],
1514                    file1x1[1..$],
1515                    [["x", "x"]]
1516         );
1517 
1518     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"],
1519                    file1x1[1..$],
1520                    [["field1", "field1_values"],
1521                     ["x", "x"]]
1522         );
1523 
1524     auto file1x1b = [["fld1"],
1525                     [""]];
1526 
1527     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"],
1528                    file1x1b,
1529                    [["fld1", "fld1_values"],
1530                     ["", ""]]
1531         );
1532 
1533     auto file1x0 = [["fld1"]];
1534 
1535     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"],
1536                    file1x0,
1537                    [["fld1", "fld1_values"]]
1538         );
1539 
1540     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"],
1541                    file1x0[1..$],
1542                    []
1543         );
1544 
1545     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"],
1546                    file1x0[1..$],
1547                    [["field1", "field1_values"]]
1548         );
1549 
1550     /* Alternate delimiters. */
1551     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"],
1552                    file1,
1553                    [["fld1_values", "fld2_values"],
1554                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1555         );
1556     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"],
1557                    file1,
1558                    [["fld1_values", "fld2_values"],
1559                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1560         );
1561     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","],
1562                    file1,
1563                    [["fld1_values", "fld2_values"],
1564                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1565         );
1566     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1567                     "--delimiter", "^", "--values-delimiter", ":"],
1568                    file1[1..$],
1569                    [["field2", "field1_values"],
1570                     ["a",  "a:c"],
1571                     ["bc", "c::c"],
1572                     ["c",  "a"]]
1573         );
1574     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1575                     "--values-delimiter", "\\"],
1576                    file1[1..$],
1577                    [["a", "a",  "a"],
1578                     ["c", "a",  "a"],
1579                     ["c", "bc", "bc\\bc"],
1580                     ["a", "c",  "c"],
1581                     ["", "bc",  "bc"]]
1582         );
1583 }
1584 
1585 /* Summary Operators and Calculators
1586  *
1587  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1588  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1589  * Calculator is used to manage the summary calculation for each unique key in the input.
1590  *
1591  * As an example, consider the command:
1592  *
1593  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1594  *
1595  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1596  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1597  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1598  * calculator needs to track occurrence count and sum. Calculators produce the final
1599  * value when all processing is finished.
1600  *
1601  * Summary field headers
1602  *
1603  * There are several options for specifying summary field headers. The defaults combine the
1604  * operator name and the header of the field summarized. The defaults can be overridden
1605  * on the command line. These scenarios are supported via the operator constructor and the
1606  * processHeaderLine() method.
1607  *
1608  * Missing field policy
1609  *
1610  * At present, tsv-summarize has a single policy for handling missing values that applies
1611  * to all operators. However, it is logically operator specific and is implemented that
1612  * way. The MissingFieldPolicy class describes the policy; each operator contains one.
1613  * Calculators access their operator's policy object.
1614  */
1615 
/** An Operator represents one summary calculation specified on the command line,
 * e.g. '--mean 5'. It supplies the header for its summary field, advertises the
 * fields whose values must be saved, and creates the Calculator objects that do the
 * per-key work.
 */
interface Operator
{
    /// Header of the summary field produced by this operator.
    string header() @property;

    /// Name of the operator.
    string name() @property;

    /// Receives the fields of the input header line.
    void processHeaderLine(const char[][] fields);

    /// Numeric fields this Operator needs saved.
    size_t[] numericFieldsToSave();

    /// Text fields this Operator needs saved.
    size_t[] textFieldsToSave();

    /// Creates a Calculator for this operator.
    Calculator makeCalculator();
}
1625 
/** A Calculator manages a summary calculation for one unique key in the input. It is
 * handed each input line and produces the final value, as a string, when asked to
 * calculate.
 */
interface Calculator
{
    /// Receives the fields of the next input line.
    void processNextLine(const char[][] fields);

    /// Produces the final value, given the key's saved value lists and the print options.
    string calculate(UniqueKeyValuesLists valuesLists, ref const SummarizerPrintOptions printOptions);
}
1631 
/** Describes how missing field values are handled. A field is considered missing
 * when it is empty. Exactly one of three modes is in effect:
 *  - use:     missing values are processed unchanged (the default).
 *  - replace: missing values are replaced by a replacement string. This mode is
 *             selected whenever a non-empty replacement string is supplied.
 *  - exclude: missing values are skipped. Selected when exclusion is requested and
 *             no replacement string is supplied.
 */
class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are processed as-is.
    private bool _replaceMissing = false;     // Missing values are swapped for the replacement.
    private string _missingReplacement;       // The replacement text, when replacing.

    /* Construct a policy. The defaults give the 'use' mode. */
    this (in bool excludeMissing = false, in string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Reset the policy. A non-empty replacement string takes precedence over
     * the excludeMissing flag.
     */
    void updatePolicy(in bool excludeMissing, in string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* True if the field is missing, that is, empty. */
    final bool isMissingField(const char[] field) const
    {
        return field.length == 0;
    }

    /* True if missing values are processed unchanged. */
    final bool useMissing() const @property
    {
        return _useMissing;
    }

    /* True if missing values are skipped: neither used nor replaced. */
    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    /* True if missing values are replaced by missingReplacement. */
    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    /* The replacement string most recently passed to updatePolicy. */
    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}
1675 
1676 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
1677  * while reading data. Operations like median collect all values and operate on them when
1678  * running the final calculation. Value lists are needed for each unique key. A command
1679  * using multiple Operators may save multiple fields. And, different Operators may be run
1680  * against the same field.
1681  *
1682  * The last part motivates these classes. Handling large data sets necessitates minimizing
1683  * in-memory storage, making it desirable to share identical lists between Calculators.
1684  * Otherwise, each Calculator could implement its own storage, which would be simpler.
1685  *
1686  * The setup works as follows:
1687  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
1688  *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
1689  *    of the fields advertised by Operators as needing sharing. This list gets created
1690  *    during command initialization (SummarizerBase.setOperators).
1691  *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
1692  *    time a new unique key is found, in parallel to the Calculator objects created for the
1693  *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
1694  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
1695  *    Calculators, saving the values.
1696  *  - Calculators retrieve the saved values during the calculation phase. The calculator's
1697  *    processNextField method is typically a no-op.
1698  *  - Calculators cannot make assumptions about the order of the saved values. This is
1699  *    a pragmatic concession to median and quantile calculations, which need to sort the data,
1700  *    at least partially. Rather than generate sorted copies, the current algorithms
1701  *    sort the data in place.
1702  *
1703  * One concession to duplicate storage is that text and numeric versions of the same
1704  * field might be stored. The reason is because it's important to convert text to numbers
1705  * as they are read so that useful error messages can be generated. And, storing both
1706  * forms of the same field should be less common.
1707  *
1708  * The current implementation uses the same missing values policy for all fields. If
1709  * multiple policies become supported this will need to change.
1710  *
1711  * Built-in calculations - UniqueKeyValuesLists have a built-in median operation. This is
1712  * to avoid repeated calculations of the median by different calculations.
1713  */
1714 
/** Records which field indices need their values saved, and creates the per-key
 * UniqueKeyValuesLists objects that do the saving.
 */
class SharedFieldValues
{
    // Indices of the fields whose values need to be saved, numeric and text separately.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to register a field index whose numeric values
     * should be saved. E.g. '--median 7' will end up calling addNumericIndex(6), 6
     * being the zero-based index. An index already registered is not added again.
     */
    final void addNumericIndex (size_t index)
    {
        if (_numericFieldIndices.canFind(index)) return;
        _numericFieldIndices ~= index;
    }

    /* Same as addNumericIndex, but registers the index for saving text values. */
    final void addTextIndex (size_t index)
    {
        if (_textFieldIndices.canFind(index)) return;
        _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program
     * if no keys are being used (entire column summarized). The new object is
     * constructed from the index lists registered so far.
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
    }
}
1744 
/** Holds the saved field values for a single unique key.
 *
 * One list of values is kept for each field index given to the constructor. Numeric
 * (double) and text (string) lists are kept separately, so the same field index may
 * appear in both. Accessors look lists up by field index and expect the index to have
 * been given to the constructor.
 */
class UniqueKeyValuesLists
{
    /* A FieldValues object holds the list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn results in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;    // Not referenced within this class.

    /* The constructor takes arrays of field indices to be saved. One FieldValues
     * object is created per index, in the order the indices are given.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        foreach (fieldIndex; numericFieldIndices)
            _numericFieldValues ~= new FieldValues!double(fieldIndex);

        foreach (fieldIndex; textFieldIndices)
            _textFieldValues ~= new FieldValues!string(fieldIndex);
    }

    /* Passes an input line to every value list, numeric lists first, then text lists.
     * Each list picks out and saves its own field, subject to the missing field policy.
     */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (numericList; _numericFieldValues)
            numericList.processNextLine(fields, missingPolicy);

        foreach (textList; _textFieldValues)
            textList.processNextLine(fields, missingPolicy);
    }

    /* Linear search for the numeric list saved for a field index. */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        auto matches = _numericFieldValues.find!(list => list.fieldIndex == index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Linear search for the text list saved for a field index. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        auto matches = _textFieldValues.find!(list => list.fieldIndex == index);
        assert(!matches.empty);
        return matches.front;
    }

    /* The numeric values saved for the field. Order is not guaranteed. */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray();
    }

    /* The numeric values saved for the field, sorted. The saved list itself is sorted. */
    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray();
    }

    /* The text values saved for the field. Order is not guaranteed. */
    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray();
    }

    /* The text values saved for the field, sorted. The saved list itself is sorted. */
    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray();
    }

    /* The median of the numeric values saved for the field. */
    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median();
    }

    /* The list of values saved for one field. Keeps two flags recording whether the
     * list is currently sorted and whether the cached median is current. Both flags
     * are cleared whenever a value is added.
     */
    private class FieldValues(ValueType)
    {
        import std.array : appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        /* Number of values saved so far. */
        final size_t length() const @property
        {
            return _values.data.length;
        }

        /* Zero-based index of the field this list saves. */
        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this list's field from the input line, converted to ValueType.
         * A missing field is saved unchanged, replaced, or skipped, according to
         * the missing field policy.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];

            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                _values.put(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
            }
            else
            {
                return;     // Missing value excluded. Nothing saved, cached state still valid.
            }

            /* A value was added. Sort order and median need to be recomputed. */
            _haveMedian = false;
            _isSorted = false;
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The saved values, in their current order. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The saved values, sorted. Sorting is done in place, and only if values
         * have been added since the last sort.
         */
        final ValueType[] getSortedArray()
        {
            import std.algorithm : sort;

            if (_isSorted) return _values.data;

            sort(_values.data);
            _isSorted = true;
            return _values.data;
        }

        /* The median, as computed by tsv_numerics.rangeMedian. The result is cached
         * until another value is added.
         * NOTE(review): rangeMedian is handed the saved array itself; presumably it
         * may reorder it - confirm against tsv_numerics.
         */
        final ValueType median()
        {
            import tsv_numerics : rangeMedian;

            if (_haveMedian) return _medianValue;

            _medianValue = _values.data.rangeMedian();
            _haveMedian = true;
            return _medianValue;
        }
    }
}
1901 
1902 /* SingleFieldOperator is a base class for single field operators, the most common
1903  * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
1904  */
/** Base class for operators that summarize a single field.
 *
 * Handles the operator name, field index, missing field policy, and the summary
 * header, including custom headers and headers taken from the input header line.
 * Derived classes implement makeCalculator.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Params:
     *   operatorName      = Operator name. Also the header suffix when useHeaderSuffix is set.
     *   fieldIndex        = Zero-based index of the field being summarized.
     *   missingPolicy     = Missing field policy for this operator.
     *   useHeaderSuffix   = Whether the operator name is used when building default headers.
     *   allowCustomHeader = Whether setCustomHeader may be called.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        /* Default header, built from the field index. May be overridden by a custom
         * header or by the header line.
         */
        immutable string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Sets a custom header. Once set, processHeaderLine leaves the header alone.
     * The operator must allow custom headers.
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    final string name() const @property
    {
        return _name;
    }

    final string header() const @property
    {
        return _header;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the operator's field values should be saved. These should be called during
     * construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    /* Rebuilds the header from the operator's field in the input header line. Does
     * nothing if a custom header has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        immutable string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}
2005 
2006 /* SingleFieldCalculator is a base class for the common case of calculators using a single
2007  * field. Derived classes implement processNextField() rather than processNextLine().
2008  */
/** Base class for calculators that work on a single field.
 *
 * processNextLine picks the calculator's field out of the input line, applies the
 * operator's missing field policy, and hands the resulting value to
 * processNextField. Derived classes implement processNextField and getOperator.
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    /* Zero-based index of the field this calculator reads. */
    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Forwards the calculator's field to processNextField. A missing field is
     * forwarded unchanged, replaced, or dropped, according to the policy.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] field = fields[_fieldIndex];

        if (!policy.useMissing && policy.isMissingField(field))
        {
            /* A missing value that is not used as-is: substitute it or drop it. */
            if (policy.replaceMissing) processNextField(policy.missingReplacement);
            return;
        }

        processNextField(field);
    }

    /* The operator this calculator belongs to. Source of the missing field policy. */
    abstract SingleFieldOperator getOperator();

    /* Receives the field value to include in the calculation. */
    abstract void processNextField(const char[] field);
}
2044 
2045 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2046 version(unittest)
2047 {
2048     /** A helper for SingleFieldOperator unit tests.
2049      *
2050      * testSingleFieldOperator takes a set of split file values, a field index, a header
2051      * suffix, and a set of expected values. The expected values array contains the
2052      * initial value (zero entries) and the expected values after each line. (One more
2053      * expected value than input lines.) The zero entry case is what is generated for an
2054      * empty file. An example testing the 'min' operator against a file with 2 columns,
2055      * 3 rows, using field index 1:
2056      *
2057      *    testSingleFieldOperator!MinOperator(
2058      *       [["10", "100"],               // The split file. 3 rows by 2 columns.
2059      *        ["5", "50"],
2060      *        ["20", "200"]],
2061      *       1,                            // Field index (zero-based, so "100", "50", "200")
2062      *       "min",                        // The header suffix, normally the operator name.
2063      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2064      *
2065      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2066      * Then the operator is tested against each column, a total of six calls. Headers
2067      * are automatically checked. Additional entries can be used to extend coverage.
2068      *
2069      * A non-default MissingFieldPolicy can be provided as an optional last argument.
2070      * Operator tests should include exclusion and replacement variations. See operator
2071      * unit tests for details.
2072      *
2073      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2074      * init arguments. Currently this is used only by the quantile operator.
2075      *
2076      * These tests do not check unique key behavior (group-by). Operators don't have info
2077      * about unique keys, and interact with them only indirectly, via Calculators.
2078      */
void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
    (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
     const char[][] expectedValues,
     MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
{
    /* Forward everything to the base helper, passing no extra operator init arguments. */
    testSingleFieldOperatorBase!OperatorClass(
        splitFile,
        fieldIndex,
        headerSuffix,
        expectedValues,
        missingPolicy);
}
2086 
2087     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2088         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2089          const char[][] expectedValues,
2090          MissingFieldPolicy missingPolicy,
2091          T extraOpInitArgs)
2092     {
2093         import std.format : format;
2094         import std.array : appender;
2095         import std.string : chomp;
2096         import std.traits : EnumMembers;
2097 
2098         auto numFields = (splitFile[0]).length;
2099 
2100         assert(fieldIndex < numFields,
2101                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2102                       headerSuffix));
2103         assert(splitFile.length + 1 == expectedValues.length,
2104                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2105                       headerSuffix));
2106 
2107         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2108         auto printOptions = SummarizerPrintOptions('#', '|');
2109 
2110         /* An input header line. */
2111         string[] inputHeaderLine = new string[numFields];
2112         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2113 
2114         /* The different expected output field headers. */
2115         auto outputFieldHeaderWithNoHeaderLine =
2116             fieldHeaderFromIndex(fieldIndex)
2117             .summaryHeaderFromFieldHeader(headerSuffix);
2118         auto outputFieldHeaderFromHeaderLine =
2119             inputHeaderLine[fieldIndex]
2120             .summaryHeaderFromFieldHeader(headerSuffix);
2121         auto customOutputFieldHeader = "custom";
2122 
2123         enum HeaderUsecase {
2124             HeaderLine_DefaultHeader,
2125             HeaderLine_CustomHeader,
2126             NoHeaderLine_DefaultHeader,
2127             NoHeaderLine_CustomHeader,
2128             NoHeaderLine_NoOutputHeader,
2129         }
2130 
2131         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2132         {
2133             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2134                           op.name, hc, actual, expected);
2135         }
2136 
2137         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2138                                   const char[] actual, const char[] expected)
2139         {
2140             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2141                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2142         }
2143 
2144         /* Run the logic for each header use case. */
2145         foreach (hc; EnumMembers!HeaderUsecase)
2146         {
2147             bool hasInputHeader = (
2148                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2149                 hc == HeaderUsecase.HeaderLine_CustomHeader
2150                 );
2151             bool hasOutputHeader = (
2152                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2153                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2154                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2155                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2156                 );
2157             bool hasCustomHeader = (
2158                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2159                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2160                 );
2161 
2162             if (hasCustomHeader) assert(hasOutputHeader);
2163 
2164             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2165 
2166             if (hasCustomHeader)
2167             {
2168                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2169                 op.setCustomHeader(customOutputFieldHeader);
2170             }
2171 
2172             Operator[] operatorArray;
2173             operatorArray ~= op;
2174 
2175             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2176             summarizer.setOperators(inputRangeObject(operatorArray));
2177 
2178             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2179 
2180             if (hasOutputHeader)
2181             {
2182                 /* Write the header line. Note that this is a one-field header, */
2183                 auto headerLineOutput = appender!(char[])();
2184                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2185 
2186                 /* Test that the header was generated correctly.
2187                  *
2188                  * Note: Because the output is generated by a Summarizer, it will have a
2189                  * trailing newline. Use chomp to trim it.
2190                  */
2191                 final switch (hc)
2192                 {
2193                 case HeaderUsecase.HeaderLine_DefaultHeader:
2194                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2195                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2196                                                outputFieldHeaderFromHeaderLine));
2197                     break;
2198                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2199                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2200                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2201                                                outputFieldHeaderWithNoHeaderLine));
2202                     break;
2203                 case HeaderUsecase.HeaderLine_CustomHeader:
2204                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2205                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2206                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2207                                                customOutputFieldHeader));
2208                     break;
2209                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2210                     break;
2211                }
2212 
2213             }
2214 
2215             /* For each line, process the line, generate the output, and test that the
2216              * value is correct. Start with the empty file case.
2217              */
2218             foreach (i, const char[] expected; expectedValues)
2219             {
2220                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2221                 auto summaryLineOutput = appender!(char[])();
2222                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2223                 assert(summaryLineOutput.data.chomp == expected,
2224                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2225                                           summaryLineOutput.data.chomp, expectedValues[i]));
2226             }
2227         }
2228     }
2229 }
2230 
/** Base class for operators that consume no input fields.
 *
 * The main use case is CountOperator, which counts the occurrences of each unique
 * key. Other uses are possible, for example, weighted random number assignment.
 *
 * ZeroFieldOperator and its companion ZeroFieldCalculator exist primarily to make
 * explicit what information such an operator has available. The split fields passed
 * to processHeaderLine and processNextLine do not contain every field of the input;
 * only fields required by operators acting on specific fields are included. That is
 * easy to overlook when implementing an operator.
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /** Creates the operator. Until a custom header is set, the output header is
     * the operator name.
     */
    this(string operatorName)
    {
        _header = _name = operatorName;
    }

    /** Replaces the output header with the caller supplied text. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /** Custom headers are accepted by this base implementation. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /** The operator name given at construction. */
    final string name() const @property
    {
        return _name;
    }

    /** The current output header: the operator name, or the custom header if set. */
    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* ZeroFieldOperators have no access to fields, so no numeric fields are saved.
     * The result is an empty (null) array.
     */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* ZeroFieldOperators have no access to fields, so no text fields are saved.
     * The result is an empty (null) array.
     */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    /** Derived classes supply the calculator that does the actual work. */
    abstract ZeroFieldCalculator makeCalculator();
}
2293 
/** Base class for calculators that do not use fields as input, in particular the
 * Count operator's calculator. It is the companion to the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() instead of processNextLine(), and
 * the single argument form of calculate() instead of the two argument form. Both
 * are declared abstract here.
 */
class ZeroFieldCalculator : Calculator
{
    this() { }

    /* The two hooks derived classes must provide. */
    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);

    /* The line's fields are ignored; each line simply counts as one entry. */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        this.processNextEntry();
    }

    /* The saved values lists are ignored; the call is forwarded to the single
     * argument form implemented by the derived class.
     */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return this.calculate(printOptions);
    }
}
2318 
version(unittest)
{
    /** A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the expected values
     * after each line, starting with the value produced before any line has been
     * processed (the empty file case). It therefore needs exactly one more entry than
     * there are rows in splitFile.
     *
     * The operator is run once for every header use case: with and without an input
     * header line, with the default or a custom output header, and with no output
     * header at all. Custom header cases are skipped for operators that do not allow
     * custom headers.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     *
     * Params:
     *   splitFile      = The input rows, each already split into fields. May be empty.
     *   defaultHeader  = The output header expected when no custom header is set.
     *   expectedValues = Expected summary value before the first row and after each row.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* A file with no rows has no fields. Guarding the index keeps the empty file
         * case usable and lets the row count assert below report real mismatches,
         * rather than failing on an array bounds error first.
         */
        size_t numFields = (splitFile.length > 0) ? splitFile[0].length : 0;

        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. defaultHeader: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line: header0, header1, ... one entry per input field. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        /* Failure message for a header mismatch. */
        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        /* Failure message for a summary value mismatch. */
        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            /* A custom header only makes sense when a header is written. */
            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case: index zero is checked
             * before any line has been processed.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i,
                                          summaryLineOutput.data.chomp, expected));
            }
        }
    }
}
2459 
/* Specific operators.
 *
 * Notes:
 * - A 'Calculator' inner class declared 'static' (e.g. CountOperator.CountCalculator)
 *   does not keep a reference to the context of the outer class. In exchange, its
 *   instances need to hold all needed state themselves.
 * - 'Calculator' inner classes that are not 'static' can reach their operator through
 *   'this.outer'. They are constructed with the field index they are summarizing.
 */
2467 
/** CountOperator reports how many times each unique key occurs, or the number of
 * input lines when there is no unique key.
 *
 * Unlike most other operators, CountOperator does not summarize a specific field on
 * the line. It summarizes a property of the unique key itself, which is why it
 * derives from ZeroFieldOperator rather than SingleFieldOperator.
 */
class CountOperator : ZeroFieldOperator
{
    this()
    {
        super("count");
    }

    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator;
    }

    /* Static nested class: it needs nothing from the enclosing operator. */
    static class CountCalculator : ZeroFieldCalculator
    {
        private size_t _numEntries;   // size_t.init is zero

        /* Each entry seen adds one to the tally. */
        final override void processNextEntry()
        {
            ++_numEntries;
        }

        /* The tally, formatted via the print options. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_numEntries);
        }
    }
}
2502 
unittest // CountOperator
{
    /* One, two and three column inputs, three rows each. The same running counts
     * are expected for all of them.
     */
    auto inputFiles = [
        [["10"], ["9.5"], ["11"]],
        [["20", "-30"], ["21", "-29"], ["22", "-31"]],
        [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]],
    ];

    /* Count before any line, then after each of the three lines. */
    auto runningCounts = ["0", "1", "2", "3"];

    foreach (file; inputFiles)
    {
        testZeroFieldOperator!CountOperator(file, "count", runningCounts);
    }
}
2513 
/** RetainOperator keeps the first value seen for a field and leaves the header
 * unchanged.
 *
 * It is meant for fields whose value is expected to be identical for every
 * occurrence of the unique key, where the goal is to pass the value through as is.
 * It behaves like FirstOperator, except that the original header is preserved. The
 * header preservation is set up in the call to the SingleFieldOperator constructor,
 * which is passed No.useHeaderSuffix and No.allowCustomHeader.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
 */
class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    class RetainCalculator : SingleFieldCalculator
    {
        /* A separate flag is needed: an empty string is a legitimate first value. */
        private bool _haveValue = false;
        private string _retained = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /* Only the first field passed in is kept; later ones are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_haveValue) return;

            _retained = nextField.to!string;
            _haveValue = true;
        }

        /* The retained value, or the empty string if no field was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _retained;
        }
    }
}
2566 
unittest // RetainOperator
{
    auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected output before any line and after each line, per column: empty at
     * first, then the first row's value from there on.
     */
    auto keepsCol1 = ["", "r1c1", "r1c1", "r1c1"];
    auto keepsCol2 = ["", "r1c2", "r1c2", "r1c2"];
    auto keepsCol3 = ["", "r1c3", "r1c3", "r1c3"];

    testSingleFieldOperator!RetainOperator(col1File, 0, "", keepsCol1);
    testSingleFieldOperator!RetainOperator(col2File, 0, "", keepsCol1);
    testSingleFieldOperator!RetainOperator(col2File, 1, "", keepsCol2);
    testSingleFieldOperator!RetainOperator(col3File, 0, "", keepsCol1);
    testSingleFieldOperator!RetainOperator(col3File, 1, "", keepsCol2);
    testSingleFieldOperator!RetainOperator(col3File, 2, "", keepsCol3);

    /* The first row has an empty (missing) value. */
    auto col1misFile = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"],
                                           excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");   // Replace missing
    testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"],
                                           replaceMissing);
}
2586 
/** FirstOperator outputs the first value encountered for the field.
 */
class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    class FirstCalculator : SingleFieldCalculator
    {
        /* A separate flag is needed: an empty string is a legitimate first value. */
        private bool _seenFirst = false;
        private string _firstValue = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field on the first call only; every later call is a no-op. */
        final override void processNextField(const char[] nextField)
        {
            if (_seenFirst) return;

            _firstValue = nextField.to!string;
            _seenFirst = true;
        }

        /* The recorded value, or the empty string if no field was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _firstValue;
        }
    }
}
2631 
unittest // FirstOperator
{
    auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected output before any line and after each line, per column: empty at
     * first, then the first row's value from there on.
     */
    auto firstOfCol1 = ["", "r1c1", "r1c1", "r1c1"];
    auto firstOfCol2 = ["", "r1c2", "r1c2", "r1c2"];
    auto firstOfCol3 = ["", "r1c3", "r1c3", "r1c3"];

    testSingleFieldOperator!FirstOperator(col1File, 0, "first", firstOfCol1);
    testSingleFieldOperator!FirstOperator(col2File, 0, "first", firstOfCol1);
    testSingleFieldOperator!FirstOperator(col2File, 1, "first", firstOfCol2);
    testSingleFieldOperator!FirstOperator(col3File, 0, "first", firstOfCol1);
    testSingleFieldOperator!FirstOperator(col3File, 1, "first", firstOfCol2);
    testSingleFieldOperator!FirstOperator(col3File, 2, "first", firstOfCol3);

    /* The first row has an empty (missing) value. */
    auto col1misFile = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"],
                                          excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");   // Replace missing
    testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"],
                                          replaceMissing);
}
2651 
/** LastOperator outputs the most recent value encountered for the field.
 */
class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    class LastCalculator : SingleFieldCalculator
    {
        private string _latest = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /* Every call overwrites the stored value, so the final call wins. */
        final override void processNextField(const char[] nextField)
        {
            auto copyOfField = nextField.to!string;
            _latest = copyOfField;
        }

        /* The stored value, or the empty string if no field was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _latest;
        }
    }
}
2691 
unittest // LastOperator
{
    auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected output before any line and after each line, per column: empty at
     * first, then the value from the row just processed.
     */
    auto tracksCol1 = ["", "r1c1", "r2c1", "r3c1"];
    auto tracksCol2 = ["", "r1c2", "r2c2", "r3c2"];
    auto tracksCol3 = ["", "r1c3", "r2c3", "r3c3"];

    testSingleFieldOperator!LastOperator(col1File, 0, "last", tracksCol1);
    testSingleFieldOperator!LastOperator(col2File, 0, "last", tracksCol1);
    testSingleFieldOperator!LastOperator(col2File, 1, "last", tracksCol2);
    testSingleFieldOperator!LastOperator(col3File, 0, "last", tracksCol1);
    testSingleFieldOperator!LastOperator(col3File, 1, "last", tracksCol2);
    testSingleFieldOperator!LastOperator(col3File, 2, "last", tracksCol3);

    /* The first row has an empty (missing) value. */
    auto col1misFile = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"],
                                         excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");   // Replace missing
    testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         replaceMissing);
}
2711 
/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 */
class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    class MinCalculator : SingleFieldCalculator
    {
        private bool _awaitingFirst = true;
        private double _smallest = double.nan;   // Reported as-is if no field is processed

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double. The first value is always taken; after
         * that only a strictly smaller value replaces the current minimum.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable candidate = nextField.to!double;

            if (_awaitingFirst || candidate < _smallest)
            {
                _smallest = candidate;
                _awaitingFirst = false;
            }
        }

        /* The minimum so far, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_smallest);
        }
    }
}
2761 
unittest // MinOperator
{
    /* One scenario: input rows, the field to summarize, and the expected minimum
     * before any line and after each line.
     */
    static struct Scenario
    {
        string[][] rows;
        size_t fieldIndex;
        string[] minAfterEachLine;
    }

    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    foreach (scenario; [
            Scenario(col1File, 0, ["nan", "10", "9.5", "9.5"]),
            Scenario(col2File, 0, ["nan", "20", "20", "20"]),
            Scenario(col2File, 1, ["nan", "-30", "-30", "-31"]),
            Scenario(col3File, 0, ["nan", "9009", "199", "199"]),
            Scenario(col3File, 1, ["nan", "9", "0", "0"]),
            Scenario(col3File, 2, ["nan", "-4.5", "-4.5", "-4.5"]),
        ])
    {
        testSingleFieldOperator!MinOperator(scenario.rows, scenario.fieldIndex, "min",
                                            scenario.minAfterEachLine);
    }

    /* The first row has an empty (missing) value. */
    auto col1misFile = [[""], ["10"], ["-10"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5");   // Replace missing
    testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"],
                                        replaceMissing);
}
2781 
/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
 */
class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    class MaxCalculator : SingleFieldCalculator
    {
        private bool _awaitingFirst = true;
        private double _largest = double.nan;   // Reported as-is if no field is processed

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double. The first value is always taken; after
         * that only a strictly larger value replaces the current maximum.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable candidate = nextField.to!double;

            if (_awaitingFirst || candidate > _largest)
            {
                _largest = candidate;
                _awaitingFirst = false;
            }
        }

        /* The maximum so far, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_largest);
        }
    }
}
2831 
unittest // MaxOperator
{
    /* One scenario: input rows, the field to summarize, and the expected maximum
     * before any line and after each line.
     */
    static struct Scenario
    {
        string[][] rows;
        size_t fieldIndex;
        string[] maxAfterEachLine;
    }

    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    foreach (scenario; [
            Scenario(col1File, 0, ["nan", "10", "10", "11"]),
            Scenario(col2File, 0, ["nan", "20", "21", "22"]),
            Scenario(col2File, 1, ["nan", "-30", "-29", "-29"]),
            Scenario(col3File, 0, ["nan", "9009", "9009", "9009"]),
            Scenario(col3File, 1, ["nan", "9", "9", "9"]),
            Scenario(col3File, 2, ["nan", "-4.5", "-0.5", "12"]),
        ])
    {
        testSingleFieldOperator!MaxOperator(scenario.rows, scenario.fieldIndex, "max",
                                            scenario.maxAfterEachLine);
    }

    /* The first row has an empty (missing) value. */
    auto col1misFile = [[""], ["-10"], ["10"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "nan", "-10", "10"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5");   // Replace missing
    testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "5", "5", "10"],
                                        replaceMissing);
}
2851 
/** RangeOperator outputs the difference between the maximum and minimum values. With
 * a single value, or when all values are the same, the range is zero. This is a
 * numeric operator.
 */
class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    class RangeCalculator : SingleFieldCalculator
    {
        /* Both bounds start at zero, so the range is zero before any field is seen. */
        private bool _awaitingFirst = true;
        private double _low = 0.0;
        private double _high = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double and widens the tracked bounds as needed. */
        final override void processNextField(const char[] nextField)
        {
            immutable candidate = nextField.to!double;

            if (_awaitingFirst)
            {
                /* The first value is both the minimum and the maximum. */
                _awaitingFirst = false;
                _low = candidate;
                _high = candidate;
                return;
            }

            /* A value is checked against the lower bound only if it did not raise
             * the upper bound.
             */
            if (candidate > _high) _high = candidate;
            else if (candidate < _low) _low = candidate;
        }

        /* Maximum minus minimum, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_high - _low);
        }
    }
}
2908 
unittest // RangeOperator
{
    /* One scenario: input rows, the field to summarize, and the expected range
     * before any line and after each line.
     */
    static struct Scenario
    {
        string[][] rows;
        size_t fieldIndex;
        string[] rangeAfterEachLine;
    }

    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    foreach (scenario; [
            Scenario(col1File, 0, ["0", "0", "0.5", "1.5"]),
            Scenario(col2File, 0, ["0", "0", "1", "2"]),
            Scenario(col2File, 1, ["0", "0", "1", "2"]),
            Scenario(col3File, 0, ["0", "0", "8810", "8810"]),
            Scenario(col3File, 1, ["0", "0", "9", "9"]),
            Scenario(col3File, 2, ["0", "0", "4", "16.5"]),
        ])
    {
        testSingleFieldOperator!RangeOperator(scenario.rows, scenario.fieldIndex, "range",
                                              scenario.rangeAfterEachLine);
    }

    /* The first and third rows have empty (missing) values. */
    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");       // Exclude missing
    testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5.5");   // Replace missing
    testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          replaceMissing);
}
2928 
/** SumOperator produces the sum of all the values. This is a numeric operator.
 */
class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    class SumCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;   // The sum is zero before any field is seen

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double and adds it to the running sum. */
        final override void processNextField(const char[] nextField)
        {
            immutable addend = nextField.to!double;
            _runningSum = _runningSum + addend;
        }

        /* The running sum, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_runningSum);
        }
    }
}
2968 
unittest // SumOperator
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneCol, 0, ["0", "10", "19.5", "30.5"]),
        Case(twoCol, 0, ["0", "20", "41", "63"]),
        Case(twoCol, 1, ["0", "-30", "-59", "-90"]),
        Case(threeCol, 0, ["0", "9009", "9208", "12211"]),
        Case(threeCol, 1, ["0", "9", "9", "9.2"]),
        Case(threeCol, 2, ["0", "-4.5", "-5", "7"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!SumOperator(c.data, c.field, "sum", c.expected);

    /* Input containing empty fields, run under both explicit missing-field policies. */
    auto oneColWithGaps = [[""], ["10"], [""], ["9.5"], ["11"]];

    // Exclude missing
    testSingleFieldOperator!SumOperator(
        oneColWithGaps, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!SumOperator(
        oneColWithGaps, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
        new MissingFieldPolicy(false, "1.5"));
}
2988 
/* MeanOperator reports the arithmetic mean (average) of all values seen for a field.
 * It is a numeric operator. When no values have been processed the result is NaN.
 */
class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new MeanCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator tracking the running total and the number of values added to it. */
    class MeanCalculator : SingleFieldCalculator
    {
        private double _sum = 0.0;
        private size_t _numValues = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double, adds it to the total, and counts it. */
        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);
            _sum += fieldValue;
            ++_numValues;
        }

        /* Returns total / count formatted via printOptions.formatNumber, or NaN
         * when the count is zero.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double mean = double.nan;
            if (_numValues > 0)
            {
                mean = _sum / to!double(_numValues);
            }
            return printOptions.formatNumber(mean);
        }
    }
}
3031 
unittest // MeanOperator
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto cases = [
        Case(oneCol, 0, ["nan", "10", "9.75", "9"]),
        Case(twoCol, 0, ["nan", "20", "20.5", "21"]),
        Case(twoCol, 1, ["nan", "-30", "-29.5", "-30"]),
        Case(threeCol, 0, ["nan", "9009", "4509", "4509"]),
        Case(threeCol, 1, ["nan", "9", "4.5", "2"]),
        Case(threeCol, 2, ["nan", "-4.5", "-3", "2"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!MeanOperator(c.data, c.field, "mean", c.expected);

    /* Input containing empty fields, run under both explicit missing-field policies. */
    auto oneColWithGaps = [[""], ["6"], [""], ["14"], ["40"]];

    // Exclude missing
    testSingleFieldOperator!MeanOperator(
        oneColWithGaps, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MeanOperator(
        oneColWithGaps, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
        new MissingFieldPolicy(false, "0"));
}
3051 
/* MedianOperator reports the median of a field's values. It is a numeric operator.
 *
 * The calculation needs every field value, so the values are kept in memory. The
 * storage is managed by the unique key value lists; this operator only requests it
 * (setSaveFieldValuesNumeric) and reads the median back at calculation time.
 */
class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new MedianCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator with no state of its own; it relies on the saved field values. */
    class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Returns the median of the saved numeric values for this field, formatted
         * via printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }
}
3092 
unittest // MedianOperator
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto cases = [
        Case(oneCol, 0, ["nan", "10", "9.75", "9.5"]),
        Case(twoCol, 0, ["nan", "20", "20.5", "21"]),
        Case(twoCol, 1, ["nan", "-30", "-29.5", "-30"]),
        Case(threeCol, 0, ["nan", "9009", "4509", "4509"]),
        Case(threeCol, 1, ["nan", "9", "4.5", "0"]),
        Case(threeCol, 2, ["nan", "-4.5", "-3", "-1.5"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!MedianOperator(c.data, c.field, "median", c.expected);

    /* Input containing empty fields, run under both explicit missing-field policies. */
    auto oneColWithGaps = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    // Exclude missing
    testSingleFieldOperator!MedianOperator(
        oneColWithGaps, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MedianOperator(
        oneColWithGaps, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
        new MissingFieldPolicy(false, "0"));
}
3112 
/* QuantileOperator reports the value found at a given cumulative probability of a
 * field's data. It is a numeric operator.
 *
 * For example, quantiles may be requested for probabilities 0.25, 0.5 and 0.75
 * (equivalently the 25th, 50th and 75th percentile ranks; the 50th percentile is the
 * median). Data is sorted in ascending order. One operator instance handles a single
 * probability, though it is common to request several quantiles of the same field.
 *
 * The output header is "pct" followed by the probability expressed as a percentage
 * (e.g. "pct50"); probability zero yields "pct0".
 *
 * The calculation needs every field value, so the values are kept in memory. The
 * storage is managed by the unique key value lists.
 */
class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /* Builds the header text for a probability in the range [0.0, 1.0]. */
    private static string makeHeader(double probability)
    {
        import std.format : format;

        if (probability == 0.0)
        {
            return "pct0";
        }
        return format("pct%02g", probability * 100.0);
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(probability >= 0.0 && probability <= 1.0);

        super(makeHeader(probability), fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new QuantileCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator with no state of its own; it relies on the saved field values. */
    class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Applies tsv_numerics.quantile to the sorted saved values using the
         * operator's probability and formats the result via printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_numerics : quantile;

            double probability = this.outer._prob;
            auto quantileValue = quantile(probability, valuesLists.numericValuesSorted(fieldIndex));
            return printOptions.formatNumber(quantileValue);
        }
    }
}
3168 
unittest // QuantileOperator
{
    /* One table row per helper invocation. */
    static struct Case
    {
        string[][] data;
        size_t field;
        string header;
        string[] expected;
        double probability;
    }

    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultMissing = new MissingFieldPolicy;

    auto cases = [
        /* Same as the median tests. */
        Case(oneCol, 0, "pct50", ["nan", "10", "9.75", "9.5"], 0.50),
        Case(twoCol, 0, "pct50", ["nan", "20", "20.5", "21"], 0.50),
        Case(twoCol, 1, "pct50", ["nan", "-30", "-29.5", "-30"], 0.50),
        Case(threeCol, 0, "pct50", ["nan", "9009", "4509", "4509"], 0.50),
        Case(threeCol, 1, "pct50", ["nan", "9", "4.5", "0"], 0.50),
        Case(threeCol, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], 0.50),

        /* The extremes (0, 1), are min and max. */
        Case(oneCol, 0, "pct0", ["nan", "10", "9.5", "7.5"], 0.0),
        Case(twoCol, 0, "pct0", ["nan", "20", "20", "20"], 0.0),
        Case(twoCol, 1, "pct0", ["nan", "-30", "-30", "-31"], 0.0),
        Case(threeCol, 0, "pct0", ["nan", "9009", "9", "9"], 0.0),
        Case(threeCol, 1, "pct0", ["nan", "9", "0", "-3"], 0.0),
        Case(threeCol, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], 0.0),

        Case(oneCol, 0, "pct100", ["nan", "10", "10", "10"], 1.0),
        Case(twoCol, 0, "pct100", ["nan", "20", "21", "22"], 1.0),
        Case(twoCol, 1, "pct100", ["nan", "-30", "-29", "-29"], 1.0),
        Case(threeCol, 0, "pct100", ["nan", "9009", "9009", "9009"], 1.0),
        Case(threeCol, 1, "pct100", ["nan", "9", "9", "9"], 1.0),
        Case(threeCol, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], 1.0),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperatorBase!QuantileOperator(
            c.data, c.field, c.header, c.expected, defaultMissing, c.probability);
    }

    /* For missing policies, re-use the median tests. */
    auto oneColWithGaps = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(
        oneColWithGaps, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
        new MissingFieldPolicy(true, ""), 0.5);

    // Replace missing
    testSingleFieldOperatorBase!QuantileOperator(
        oneColWithGaps, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
        new MissingFieldPolicy(false, "0"), 0.5);
}
3207 
/* MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value, without a normalization applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /* Calculator with no state of its own; it relies on the saved field values. */
    class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* Computes the median of the saved values, then the median of each value's
         * absolute deviation from it. The result is formatted via
         * printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_numerics : rangeMedian;

            auto median = valuesLists.numericValuesMedian(fieldIndex);
            auto values = valuesLists.numericValues(fieldIndex);
            auto medianDevs = new double[values.length];

            /* The index is size_t to match the array length type. A narrower 'int'
             * index is deprecated by the compiler and cannot address arrays with
             * more than int.max elements.
             */
            foreach (size_t i, double v; values)
                medianDevs[i] = abs(v - median);

            return printOptions.formatNumber(medianDevs.rangeMedian);
        }
    }
}
3260 
unittest // MadOperator
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoCol = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeCol = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    auto cases = [
        Case(oneCol, 0, ["nan", "0", "2.5", "5", "5", "5"]),
        Case(twoCol, 0, ["nan", "0", "0", "0"]),
        Case(twoCol, 1, ["nan", "0", "0.5", "1"]),
        Case(threeCol, 0, ["nan", "0", "4", "0"]),
        Case(threeCol, 1, ["nan", "0", "0", "0"]),
        Case(threeCol, 2, ["nan", "0", "1", "2"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!MadOperator(c.data, c.field, "mad", c.expected);

    /* Input containing empty fields, run under both explicit missing-field policies. */
    auto oneColWithGaps = [[""], ["16"], [""], ["32"], ["-4"]];

    // Exclude missing
    testSingleFieldOperator!MadOperator(
        oneColWithGaps, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MadOperator(
        oneColWithGaps, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
        new MissingFieldPolicy(false, "0"));
}
3280 
/* VarianceOperator reports the variance of a field's values, computed as the sum of
 * squared differences from the mean divided by (count - 1). It is a numeric operator.
 *
 * The mean and the sum of squared differences are updated incrementally as each value
 * arrives, so individual values are not retained by this calculator. With fewer than
 * two values the result is NaN.
 */
class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new VarianceCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator holding the incremental count, mean and squared-difference sum. */
    class VarianceCalculator : SingleFieldCalculator
    {
        private double _numValues = 0.0;
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the count, mean and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _numValues += 1.0;
            double value = to!double(nextField);
            double diffFromOldMean = value - _runningMean;
            _runningMean += diffFromOldMean / _numValues;
            double diffFromNewMean = value - _runningMean;
            _sumSqDiffs += diffFromOldMean * diffFromNewMean;
        }

        /* Returns the variance formatted via printOptions.formatNumber, or NaN when
         * fewer than two values were processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_numValues >= 2.0)
            {
                return printOptions.formatNumber(_sumSqDiffs / (_numValues - 1.0));
            }
            return printOptions.formatNumber(double.nan);
        }
    }
}
3325 
unittest // VarianceOperator
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["5"], ["10"], ["15"]];
    auto twoCol = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeCol = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    auto cases = [
        Case(oneCol, 0, ["nan", "nan", "12.5", "25"]),
        Case(twoCol, 0, ["nan", "nan", "12.5", "25"]),
        Case(twoCol, 1, ["nan", "nan", "12.5", "25"]),
        Case(threeCol, 0, ["nan", "nan", "0.5", "1"]),
        Case(threeCol, 1, ["nan", "nan", "0.5", "1"]),
        Case(threeCol, 2, ["nan", "nan", "0", "3"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!VarianceOperator(c.data, c.field, "var", c.expected);

    /* Input containing an empty field, run under both explicit missing-field policies. */
    auto oneColWithGaps = [["5"], ["10"], [""]];

    // Exclude missing
    testSingleFieldOperator!VarianceOperator(
        oneColWithGaps, 0, "var", ["nan", "nan", "12.5", "12.5"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!VarianceOperator(
        oneColWithGaps, 0, "var", ["nan", "nan", "12.5", "25"],
        new MissingFieldPolicy(false, "15"));
}
3345 
/* StDevOperator reports the standard deviation of a field's values: the square root
 * of the sum of squared differences from the mean divided by (count - 1). It is a
 * numeric operator.
 *
 * The mean and the sum of squared differences are updated incrementally as each value
 * arrives, so individual values are not retained by this calculator. With fewer than
 * two values the result is NaN.
 */
class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new StDevCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator holding the incremental count, mean and squared-difference sum. */
    class StDevCalculator : SingleFieldCalculator
    {
        private double _numValues = 0.0;
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the count, mean and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _numValues += 1.0;
            double value = to!double(nextField);
            double diffFromOldMean = value - _runningMean;
            _runningMean += diffFromOldMean / _numValues;
            double diffFromNewMean = value - _runningMean;
            _sumSqDiffs += diffFromOldMean * diffFromNewMean;
        }

        /* Returns the standard deviation formatted via printOptions.formatNumber, or
         * NaN when fewer than two values were processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_numValues >= 2.0)
            {
                return printOptions.formatNumber(sqrt(_sumSqDiffs / (_numValues - 1.0)));
            }
            return printOptions.formatNumber(double.nan);
        }
    }
}
3391 
/* StDevOperator unit tests. Expected values are compared as exact text, so these
 * tests would be improved by a tolerance option.
 */
unittest
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["1"], ["4"], ["7"]];
    auto twoCol = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeCol = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    auto cases = [
        Case(oneCol, 0, ["nan", "nan", "2.12132034356", "3"]),
        Case(twoCol, 0, ["nan", "nan", "0", "2.30940107676"]),
        Case(twoCol, 1, ["nan", "nan", "4.24264068712", "6"]),
        Case(threeCol, 0, ["nan", "nan", "9.19238815543", "13"]),
        Case(threeCol, 1, ["nan", "nan", "8.48528137424", "12"]),
        Case(threeCol, 2, ["nan", "nan", "10.6066017178", "15"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!StDevOperator(c.data, c.field, "stdev", c.expected);

    /* Input containing an empty field, run under both explicit missing-field policies. */
    auto oneColWithGaps = [["1"], ["4"], [""]];

    // Exclude missing
    testSingleFieldOperator!StDevOperator(
        oneColWithGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!StDevOperator(
        oneColWithGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
        new MissingFieldPolicy(false, "7"));
}
3413 
/* UniqueCountOperator reports how many distinct values a field has taken. Values
 * are compared as exact text, not numerically (so "1.0" and "1" are different).
 *
 * Every distinct field value is kept in memory as part of this calculation.
 */
class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new UniqueCountCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator holding the set of distinct values seen so far. */
    class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Associative array used as a set; only the keys matter. */
        private bool[string] _seen;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Records the value if it is new. The key is copied to a string only when
         * it is actually inserted.
         */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _seen)
            {
                return;
            }
            _seen[nextField.to!string] = true;
        }

        /* Returns the number of distinct values, formatted via printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t numUnique = _seen.length;
            return printOptions.formatNumber(numUnique);
        }
    }
}
3456 
unittest // UniqueCount
{
    /* One table row per helper invocation: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto twoCol = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    auto cases = [
        Case(oneCol, 0, ["0", "1", "2", "3", "3", "3", "3", "3", "4"]),
        Case(twoCol, 0, ["0", "1", "2", "2"]),
        Case(twoCol, 1, ["0", "1", "1", "2"]),
        Case(threeCol, 0, ["0", "1", "2", "3"]),
        Case(threeCol, 1, ["0", "1", "2", "2"]),
        Case(threeCol, 2, ["0", "1", "2", "3"]),
    ];

    foreach (c; cases)
        testSingleFieldOperator!UniqueCountOperator(c.data, c.field, "unique_count", c.expected);

    /* Input containing empty fields, run under both explicit missing-field policies. */
    auto oneColWithGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    // Exclude missing
    testSingleFieldOperator!UniqueCountOperator(
        oneColWithGaps, 0, "unique_count",
        ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!UniqueCountOperator(
        oneColWithGaps, 0, "unique_count",
        ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        new MissingFieldPolicy(false, "XYZ"));
}
3478 
/* MissingCountOperator reports how many of a field's values are missing. It overrides
 * the global missing-field policy: the policy supplied by the caller is kept only to
 * test fields for missing-ness, while the base class is given a separate
 * MissingFieldPolicy(false, "").
 */
class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        /* NOTE(review): presumably (false, "") makes the base class hand every field
         * to the calculator unmodified - confirm against MissingFieldPolicy.
         */
        auto basePolicy = new MissingFieldPolicy(false, "");
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, basePolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new MissingCountCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator holding the number of missing fields seen so far. */
    class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numMissing = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field when the global policy classifies it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;
            if (globalPolicy.isMissingField(nextField))
            {
                ++_numMissing;
            }
        }

        /* Returns the missing-field count, formatted via printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string formatted = printOptions.formatNumber(_numMissing);
            return formatted;
        }
    }
}
3522 
unittest // MissingCount
{
    /* One table row per scenario: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["a"], ["b"], [""], [" "], [""]];
    auto twoCol = [["abc", ""], ["", ""], ["def", ""]];
    auto threeCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    auto cases = [
        Case(oneCol, 0, ["0", "0", "0", "1", "1", "2"]),
        Case(twoCol, 0, ["0", "0", "1", "1"]),
        Case(twoCol, 1, ["0", "1", "2", "3"]),
        Case(threeCol, 0, ["0", "1", "1", "1"]),
        Case(threeCol, 1, ["0", "0", "1", "2"]),
        Case(threeCol, 2, ["0", "0", "0", "1"]),
    ];

    /* First pass: the helper's default missing-field policy. */
    foreach (c; cases)
        testSingleFieldOperator!MissingCountOperator(c.data, c.field, "missing_count", c.expected);

    /* The same expectations must hold under the exclude and replace policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    foreach (policy; [excludeMissing, replaceMissing])
    {
        foreach (c; cases)
        {
            testSingleFieldOperator!MissingCountOperator(
                c.data, c.field, "missing_count", c.expected, policy);
        }
    }
}
3553 
/* NotMissingCountOperator reports how many of a field's values are not missing. It
 * overrides the global missing-field policy: the policy supplied by the caller is kept
 * only to test fields for missing-ness, while the base class is given a separate
 * MissingFieldPolicy(false, "").
 */
class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        /* NOTE(review): presumably (false, "") makes the base class hand every field
         * to the calculator unmodified - confirm against MissingFieldPolicy.
         */
        auto basePolicy = new MissingFieldPolicy(false, "");
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, basePolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new NotMissingCountCalculator(fieldIndex);
        return calculator;
    }

    /* Calculator holding the number of non-missing fields seen so far. */
    class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numPresent = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field unless the global policy classifies it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;
            if (!globalPolicy.isMissingField(nextField))
            {
                ++_numPresent;
            }
        }

        /* Returns the non-missing count, formatted via printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string formatted = printOptions.formatNumber(_numPresent);
            return formatted;
        }
    }
}
3597 
unittest // NotMissingCount
{
    /* One table row per scenario: input rows, field index, expected outputs. */
    static struct Case { string[][] data; size_t field; string[] expected; }

    auto oneCol = [["a"], ["b"], [""], [" "], [""]];
    auto twoCol = [["abc", ""], ["", ""], ["def", ""]];
    auto threeCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    auto cases = [
        Case(oneCol, 0, ["0", "1", "2", "2", "3", "3"]),
        Case(twoCol, 0, ["0", "1", "1", "2"]),
        Case(twoCol, 1, ["0", "0", "0", "0"]),
        Case(threeCol, 0, ["0", "0", "1", "2"]),
        Case(threeCol, 1, ["0", "1", "1", "1"]),
        Case(threeCol, 2, ["0", "1", "2", "2"]),
    ];

    /* First pass: the helper's default missing-field policy. */
    foreach (c; cases)
        testSingleFieldOperator!NotMissingCountOperator(c.data, c.field, "not_missing_count", c.expected);

    /* The same expectations must hold under the exclude and replace policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    foreach (policy; [excludeMissing, replaceMissing])
    {
        foreach (c; cases)
        {
            testSingleFieldOperator!NotMissingCountOperator(
                c.data, c.field, "not_missing_count", c.expected, policy);
        }
    }
}
3628 
/* ModeOperator reports the field value that occurs most often. When several values
 * share the highest count, the one that was encountered first is reported.
 *
 * Every distinct field value is kept in memory while the calculation runs.
 *
 */
class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    class ModeCalculator : SingleFieldCalculator
    {
        /* Number of occurrences of each distinct value. */
        private size_t[string] _valueCounts;

        /* Distinct values in first-seen order; this order decides ties. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Counts one more occurrence of nextField. */
        final override void processNextField(const char[] nextField)
        {
            if (auto tally = nextField in _valueCounts)
            {
                ++(*tally);
                return;
            }

            /* First sighting: convert to string so the value can be stored both in
             * the ordered list and as a key of the count table.
             */
            string firstSeen = nextField.to!string;
            _uniqueValues.put(firstSeen);
            _valueCounts[firstSeen] = 1;
        }

        /* Returns the most frequent value, or the empty string if no value has been
         * recorded. Neither argument is used.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string winner = "";
            size_t winnerCount = 0;

            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                immutable occurrences = _valueCounts[candidate];

                /* Only a strictly larger count displaces the current winner, so the
                 * earliest value wins a tie.
                 */
                if (occurrences <= winnerCount) continue;

                winner = candidate;
                winnerCount = occurrences;
            }

            return winner;
        }
    }
}
3700 
unittest // ModeOperator
{
    auto oneColumn = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoColumns = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeColumns = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Default missing-field policy. Ties go to the value seen first, and values are
     * compared as text: "1.0", "2.0" and "2" count as three different values.
     */
    testSingleFieldOperator!ModeOperator(oneColumn, 0, "mode",
                                         ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(twoColumns, 0, "mode",
                                         ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(twoColumns, 1, "mode",
                                         ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(threeColumns, 0, "mode",
                                         ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(threeColumns, 1, "mode",
                                         ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(threeColumns, 2, "mode",
                                         ["", "a", "a", "a"]);

    /* A single column with empty fields mixed in, run under both explicit policies. */
    auto withBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ModeOperator(withBlanks, 0, "mode",
                                         ["", "", "a", "a", "a", "a", "c", "b", "b"],
                                         excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ModeOperator(withBlanks, 0, "mode",
                                         ["", "X", "X", "X", "X", "X", "X", "X", "b"],
                                         replaceMissing);
}
3722 
/* ModeCountOperator reports how many times the most frequent value was seen.
 *
 * Every distinct field value is kept in memory while the calculation runs.
 *
 */
class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    class ModeCountCalculator : SingleFieldCalculator
    {
        /* Number of occurrences of each distinct value. */
        private size_t[string] _valueCounts;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts one more occurrence of nextField. */
        final override void processNextField(const char[] nextField)
        {
            if (auto tally = nextField in _valueCounts)
            {
                ++(*tally);
                return;
            }

            /* First sighting: the key is stored as a string. */
            string firstSeen = nextField.to!string;
            _valueCounts[firstSeen] = 1;
        }

        /* Returns the largest per-value count, formatted through printOptions. The
         * count is zero when no value has been recorded. valuesLists is not used.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highest = 0;

            foreach (occurrences; _valueCounts.byValue)
            {
                if (highest < occurrences) highest = occurrences;
            }

            return printOptions.formatNumber(highest);
        }
    }
}
3777 
unittest // ModeCountOperator
{
    auto oneColumn = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoColumns = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto threeColumns = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Default missing-field policy. The second column of twoColumns shows that empty
     * fields are counted like any other value here.
     */
    testSingleFieldOperator!ModeCountOperator(oneColumn, 0, "mode_count",
                                              ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(twoColumns, 0, "mode_count",
                                              ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(twoColumns, 1, "mode_count",
                                              ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(threeColumns, 0, "mode_count",
                                              ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(threeColumns, 1, "mode_count",
                                              ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(threeColumns, 2, "mode_count",
                                              ["0", "1", "1", "1"]);

    /* A single column with empty fields mixed in, run under both explicit policies. */
    auto withBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ModeCountOperator(withBlanks, 0, "mode_count",
                                              ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
                                              excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ModeCountOperator(withBlanks, 0, "mode_count",
                                              ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
                                              replaceMissing);
}
3799 
/* ValuesOperator writes out every value of a field, separated by the alternate
 * (values) delimiter.
 *
 * All field values are held in memory for this calculation. The storage is provided
 * by the unique key value lists rather than by the calculator itself.
 */

class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);

        /* The calculator reads the saved text values back in calculate(). */
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the values are saved outside this calculator. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Joins the saved text values of this field with printOptions.valuesDelimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return savedValues.join(printOptions.valuesDelimiter);
        }
    }
}
3841 
unittest // ValuesOperator
{
    auto oneColumn = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoColumns = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto threeColumns = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* Default missing-field policy: empty fields are kept, so they show up as
     * adjacent '|' delimiters in the expected output.
     */
    testSingleFieldOperator!ValuesOperator(oneColumn, 0, "values",
                                           ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(twoColumns, 0, "values",
                                           ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(twoColumns, 1, "values",
                                           ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(threeColumns, 0, "values",
                                           ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(threeColumns, 1, "values",
                                           ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(threeColumns, 2, "values",
                                           ["", "-", "-|--", "-|--|---"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ValuesOperator(oneColumn, 0, "values",
                                           ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
                                           excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ValuesOperator(oneColumn, 0, "values",
                                           ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
                                           replaceMissing);
}
3862 
/* UniqueValuesOperator writes out each distinct value of a field once, separated by
 * the alternate (values) delimiter. Values appear in the order they were first seen.
 *
 * Every distinct field value is kept in memory while the calculation runs.
 *
 */
class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    class UniqueValuesCalculator : SingleFieldCalculator
    {
        /* Membership table for values already recorded; the stored number is never read. */
        private size_t[string] _valuesHash;

        /* Distinct values in first-seen order. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Records nextField the first time it is seen; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _valuesHash) return;

            string firstSeen = nextField.to!string;
            _uniqueValues.put(firstSeen);
            _valuesHash[firstSeen] = 1;
        }

        /* Joins the recorded values with printOptions.valuesDelimiter. valuesLists
         * is not used.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinct = _uniqueValues.data;
            return distinct.join(printOptions.valuesDelimiter);
        }
    }
}
3914 
unittest // UniqueValuesOperator
{
    auto oneColumn = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoColumns = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto threeColumns = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Default missing-field policy: the empty string is a value like any other and
     * is listed once, in the position where it was first seen.
     */
    testSingleFieldOperator!UniqueValuesOperator(oneColumn, 0, "unique_values",
                                                 ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    testSingleFieldOperator!UniqueValuesOperator(twoColumns, 0, "unique_values",
                                                 ["", "", "", "|xyz"]);
    testSingleFieldOperator!UniqueValuesOperator(twoColumns, 1, "unique_values",
                                                 ["", "50", "50", "50|52"]);
    testSingleFieldOperator!UniqueValuesOperator(threeColumns, 0, "unique_values",
                                                 ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!UniqueValuesOperator(threeColumns, 1, "unique_values",
                                                 ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!UniqueValuesOperator(threeColumns, 2, "unique_values",
                                                 ["", "-", "-|--", "-|--"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!UniqueValuesOperator(oneColumn, 0, "unique_values",
                                                 ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
                                                 excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!UniqueValuesOperator(oneColumn, 0, "unique_values",
                                                 ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
                                                 replaceMissing);
}