tsv_utils.tsv_summarize source code

1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2021, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.exception : enforce;
16 import std.format : format;
17 import std.range;
18 import std.stdio;
19 import std.typecons : tuple;
20 import std.container : DList;
21 
/* Runtime options read by druntime at startup. "gcopt=cleanup:none" asks the garbage
 * collector to skip its cleanup pass when the program terminates.
 * NOTE(review): the 2085 version gate presumably marks the first compiler release
 * supporting the 'cleanup' GC option - confirm against the druntime changelog.
 */
static if (__VERSION__ >= 2085)
{
    extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
}
23 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line and, if processing says to continue, runs
     * tsvSummarize. Returns the exit code chosen by argument processing when it
     * says to stop, 1 if tsvSummarize throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto argsResult = cmdopt.processArgs(cmdArgs);
        immutable bool shouldContinue = argsResult[0];
        if (!shouldContinue) return argsResult[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            tsvSummarize(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
56 
/** Detailed help text. Printed ahead of the getopt option list when '--help-verbose'
 * is given. Operator names listed here must match the option names registered in
 * TsvSummarizeOptions.processArgs.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  Make    Color   Time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make is generated by the command:

  $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv

This produces:

  Make   Time_min Time_mean
  ford   122      127
  chevy  124      124
  bmw    118      122

Using '--group-by Make,Color' will group by both 'Make' and 'Color'.
Omitting the '--group-by' entirely summarizes fields for the full file.

The previous example uses field names to identify fields. Field numbers
can be used as well. The next two commands are equivalent:

  $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv
  $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv

The program tries to generate useful headers, but custom headers can be
specified. Example (using -g and -H shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv

Most operators take custom headers in a similarly way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11
  --median elapsed_time,system_time,user_time
  --median '*_time'              # Wildcard. All fields ending in '_time'.

The quantile operator requires one or more probabilities after the fields:

  --quantile run_time:0.25       # Quantile 1 of the 'run_time' field
  --quantile 2:0.25              # Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   # Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
143 
/** Brief help text. Printed ahead of the getopt option list when help is requested. */
string helpText =
    "Synopsis: tsv-summarize [options] file [file...]\n" ~
    "\n" ~
    "tsv-summarize runs aggregation operations on fields in tab-separated value\n" ~
    "files. Operations can be run against the full input data or grouped by key\n" ~
    "fields. Fields can be specified either by field number or field name. Use\n" ~
    "'--help-verbose' for more detailed help.\n" ~
    "\n" ~
    "Options:\n";
154 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 *
 * After a successful processArgs call the option fields are populated, the input
 * sources are open, and the operators list holds one Operator per requested
 * summarization, in command line order.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                /// Program name
    ByLineSourceRange!() inputSources; /// Input Files
    size_t[] keyFields;                /// -g, --group-by
    bool hasHeader = false;            /// --header
    bool writeHeader = false;          /// -w, --write-header
    char inputFieldDelimiter = '\t';   /// --d|delimiter
    char valuesDelimiter = '|';        /// --v|values-delimiter
    size_t floatPrecision = 12;        /// --p|float-precision
    DList!Operator operators;          /// Operators, in the order specified.
    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.

    /* NOTE(review): a class instance created in a struct field initializer is
     * presumably built once at compile time and shared by every default-initialized
     * TsvSummarizeOptions - confirm this sharing is intended.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value.

    /* tsv-summarize operators require access to the header line when the operator is
     * created. This is because named fields may be used to describe field names. To
     * enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
     * array during initial processing by std.getopt. The group-by operation is
     * similar, but is added to the cmdLineOtherFieldOptions instead. At least one
     * cmdLineOperatorOptions entry is required.
     *
     * The different handlers are defined after processArgs.
     */

    /* CmdOptionHandler delegate signature - This is the call made to process the command
     * line option arguments after the header line has been read.
     */
    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

    private CmdOptionHandler[]  cmdLineOperatorOptions;
    private CmdOptionHandler[]  cmdLineOtherFieldOptions;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Exceptions raised during processing are caught here, reported on stderr, and
     * turned into the (false, 1) result. cmdArgs is truncated to the program name once
     * the file arguments have been consumed.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist : fieldListHelpText;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;          // --help-verbose
        bool helpFields = false;           // --help-fields
        bool versionWanted = false;        // --V|version
        bool excludeMissing = false;       // --x|exclude-missing
        string missingValueReplacement;    // --r|replace-missing


        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,
                "help-fields",        "              Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.", &addGroupByOptionHandler,

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &addOperatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &addOperatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             */

            enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

            /* The message names the options exactly as registered with getopt above. */
            enforce(inputFieldDelimiter != valuesDelimiter,
                    "Cannot use the same character for both --d|delimiter and --v|values-delimiter.");

            enforce(!(excludeMissing && missingValueReplacement.length != 0),
                    "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");

            /* Missing field policy. */
            globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                /* Run all the operator handlers. */
                cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
                cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));

                /* keyFields need to be part of the endFieldIndex, which is one past
                 * the last field index. */
                keyFields.each!(delegate (size_t x)
                                {
                                    if (x >= endFieldIndex) endFieldIndex = x + 1;
                                } );
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);


            if (hasHeader)
            {
                /* An empty first input leaves headerFields empty; the handlers still run. */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
                }

                fieldListArgProcessing();
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* getopt callback for '--g|group-by'. Defers the field list parsing until the
     * header line is available by queuing a delegate.
     */
    private void addGroupByOptionHandler(string option, string optionVal)
    {
        cmdLineOtherFieldOptions ~=
            (bool hasHeader, string[] headerFields)
            => groupByOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Parses the group-by field list into zero-based keyFields. Parse errors are
     * re-thrown with the option and its value prepended to the message.
     */
    private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import tsv_utils.common.fieldlist;

        try
        {
            keyFields =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields)
                .array;
        }
        catch (Exception e)
        {
            e.msg = format("[--%s %s]. %s", option, optionVal, e.msg);
            throw e;
        }
    }

    /* getopt callback shared by the single-field operators. Queues a delegate that
     * creates the operators once the header line is available.
     */
    private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)
    (bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;

            /* Anything left after the field list is a separator plus a custom header. */
            bool hasOptionalHeader = optionVal.length > optionValParse.consumed;
            string optionalHeader;

            if (hasOptionalHeader)
            {
                enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields.");
                enforce(optionVal.length - optionValParse.consumed > 1,
                        format("No value after field list.\n   Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                               option, option));
                optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup;
            }

            foreach (fieldIndex; fieldIndices)
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (hasOptionalHeader)
                {
                    enforce(op.allowCustomHeader, "Operator does not support custom headers.");
                    op.setCustomHeader(optionalHeader);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg);
            throw exc;
        }
    }

    /* getopt callback for '--quantile'. Queues a delegate that creates the operators
     * once the header line is available.
     */
    private void addQuantileOperatorOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>...][:<header>]'. One operator is
     * created per (field, probability) pair. Errors are re-thrown with the option, its
     * value, and the expected syntax added to the message.
     */
    private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;
            enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered.");

            auto splitRemaining =
                optionVal[optionValParse.consumed + 1 .. $]
                .findSplit(":");

            enforce(splitRemaining[1].empty || !splitRemaining[2].empty,
                    "Empty custom header.");

            auto probStr = splitRemaining[0];
            auto header = splitRemaining[2];

            double[] probs;

            foreach (str; probStr.splitter(','))
            {
                double p = str.to!double;
                enforce(p >= 0.0 && p <= 1.0,
                        format("Probability '%g' is not in the interval [0.0,1.0].", p));
                probs ~= p;
            }

            /* An empty probability list (e.g. '--quantile 2::hdr') can leave probs empty.
             * Report it as a user error rather than relying on the assert below, which
             * is compiled out of release builds.
             */
            enforce(!probs.empty, "No probabilities entered.");

            enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
                    "Cannot specify a custom header when using multiple fields or multiple probabilities.");

            assert (fieldIndices.length > 0);
            assert (probs.length > 0);
            assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

            foreach (fieldIndex; fieldIndices)
            {
                foreach (p; probs)
                {
                    auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                    if (!header.empty) op.setCustomHeader(header);
                    operators.insertBack(op);
                }
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception e)
        {
            e.msg = format(
                "[--%s %s]. %s\n   Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, e.msg, option, option);
            throw e;
        }

    }

    /* getopt callback for '--count'. Queues a delegate that adds a CountOperator. */
    private void addCountOptionHandler()
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countOptionHandler(hasHeader, headerFields);
    }

    /* Adds a CountOperator. The header arguments are unused; they are present to match
     * the CmdOptionHandler call made by the queued delegate.
     */
    private void countOptionHandler(bool hasHeader, string[] headerFields)
    {
        operators.insertBack(new CountOperator());
    }

    /* getopt callback for '--count-header'. Queues a delegate that adds a CountOperator
     * with a custom header.
     */
    private void addCountHeaderOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Adds a CountOperator that uses optionVal as its output header. */
    private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }
}
537 
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * It picks a Summarizer based on the number of key fields, hands it the operators
 * from cmdopt, feeds it the leading fields of every input line, and finally writes
 * the summary body to stdout.
 *
 * Throws: Exception when a line has fewer fields than cmdopt.endFieldIndex, or when
 * the summarizer fails on a line (re-thrown with file and line number added).
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
        bufferedByLine, throwIfWindowsNewline;

    /* Sanity checks on the input setup: at least one input source (stdin if nothing
     * else), and a byLine range that strips newlines.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* Writing through a BufferedOutputRange is faster than writing to stdout directly
     * when many lines are produced, which mostly happens with group-by.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    alias OutputType = typeof(bufferedOutput);

    /* The number of key fields decides which Summarizer is used. */
    immutable numKeyFields = cmdopt.keyFields.length;
    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!OutputType(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!OutputType(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!OutputType(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

    /* With no input header but an output header requested, write the header right
     * away. Downstream tasks in a unix pipeline can then detect errors quickly,
     * without waiting for all the data to flow through.
     */
    if (cmdopt.writeHeader && !cmdopt.hasHeader)
    {
        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
        bufferedOutput.flush;
    }

    /* Read every input source one line at a time. The fields buffer is reused. */
    auto fields = new char[][](cmdopt.endFieldIndex);
    bool headerSeen = false;
    foreach (source; cmdopt.inputSources)
    {
        foreach (lineNum, line; source.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, source.name, lineNum);

            /* Copy the leading fields that are needed into the fields buffer.
             * Note: Nothing is needed when no operator uses fields, notably when the
             * count operator is used alone to count input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t numCopied = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter))
                {
                    if (numCopied == cmdopt.endFieldIndex) break;
                    fields[numCopied] = fieldValue;
                    numCopied++;
                }

                if (numCopied == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    fields[numCopied] = line;
                    numCopied++;
                }

                enforce(numCopied >= cmdopt.endFieldIndex,
                        format("Not enough fields in line. File: %s, Line: %s",
                               source.name, lineNum));
            }

            if (cmdopt.hasHeader && lineNum == 1)
            {
                /* Only the first header line seen is used; later files' headers are skipped. */
                if (headerSeen) continue;

                summarizer.processHeaderLine(fields);
                headerSeen = true;

                /* Write the header now. This helps tasks further on in a unix
                 * pipeline detect errors quickly, without waiting for all the
                 * data to flow through the pipeline. Note that an upstream task
                 * may have flushed its header line, so the header may arrive
                 * long before the main block of data.
                 */
                summarizer.writeSummaryHeader(bufferedOutput, printOptions);
                bufferedOutput.flush;
                continue;
            }

            /* A data line. Processing throws if a field cannot be converted to the
             * expected type; the error is re-thrown with location details.
             */
            try
            {
                summarizer.processNextLine(fields);
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, source.name, lineNum,
                           (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */

    summarizer.writeSummaryBody(bufferedOutput, printOptions);
}
670 
/** Builds the default header for a field that has no header in the input.
 *
 * Used when the input has no header line but the output does. The result is
 * "fieldN", where N is the one-based field number (fieldIndex is zero-based).
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable oneBasedFieldNumber = fieldIndex + 1;
    return "field" ~ oneBasedFieldNumber.to!string;
}
680 
unittest
{
    /* Zero-based indices map to one-based "fieldN" headers. */
    immutable size_t[] fieldIndices = [0, 10];
    immutable string[] expectedHeaders = ["field1", "field11"];

    foreach (i, fieldIndex; fieldIndices)
    {
        assert(fieldHeaderFromIndex(fieldIndex) == expectedHeaders[i]);
    }
}
686 
/** Builds the header of a summary column from the header of its source field.
 *
 * The result has the form `<fieldHeader>_<operation>`. For example, field header
 * "length" with operation "max" yields "length_max". The field header typically
 * comes from a header line in the input data or was constructed by
 * fieldHeaderFromIndex().
 *
 * An empty operationName returns fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0) return fieldHeader;

    return fieldHeader ~ "_" ~ operationName;
}
701 
unittest
{
    /* A non-empty operation name is appended after an underscore. */
    immutable withOperation = summaryHeaderFromFieldHeader("originalfield", "mycalc");
    assert(withOperation == "originalfield_mycalc");

    /* An empty operation name leaves the field header untouched. */
    immutable withoutOperation = summaryHeaderFromFieldHeader("originalfield", "");
    assert(withoutOperation == "originalfield");
}
707 
/** Printing options handed to Summarizers and Calculators when writing output.
 *
 * These normally originate from command line options. They are kept in their own
 * struct for modularity. The field order matters: the struct is constructed
 * positionally as (fieldDelimiter, valuesDelimiter, floatPrecision).
 */
struct SummarizerPrintOptions
{
    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    import std.traits : isFloatingPoint, isIntegral;

    /** Formats an integral or floating point number by forwarding to
     * tsv_utils.common.numerics.formatNumber with this struct's floatPrecision.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        /* Renamed module import, so the library function is not confused with this method. */
        import numerics = tsv_utils.common.numerics;
        return numerics.formatNumber!T(n, floatPrecision);
    }
}
726 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after initializing the object, with the operators to be
 *    processed.
 *  - processHeaderLine - Called to process the header line of each file. Returns true
 *    if it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    /** Registers the operators to be processed. Called after construction. */
    void setOperators(InputRange!Operator op);

    /** Processes the header line of a file.
     *
     * Returns: true if this was the first header line processed, false otherwise
     * (relevant when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Processes one non-header line, already split into fields. */
    void processNextLine(const char[][] lineFields);

    /** Writes the summary header line to outputStream. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);

    /** Writes the summary result lines to outputStream. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}
758 
/** SummarizerBase performs the work shared by all summarizers, which is nearly
 * everything except the handling of unique keys.
 *
 * The base class handles construction, stores the Operators, and allocates the
 * SharedFieldValues when any Operator asks for field values to be saved. Derived
 * classes deal primarily with unique keys and the associated Calculators and
 * UniqueKeyValuesLists.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _inputFieldDelimiter = inputFieldDelimiter;
        _missingPolicy = missingPolicy;
    }

    /** The field delimiter supplied at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction.
     *
     * Each operator is appended to the operator list. When an operator asks for
     * numeric or text field values to be saved, those field indices are registered
     * with the SharedFieldValues object, which is created on first need.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register if the operator saves no field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (fieldIndex; numericIndices) _sharedFieldValues.addNumericIndex(fieldIndex);
            foreach (fieldIndex; textIndices) _sharedFieldValues.addTextIndex(fieldIndex);
        }
    }

    /** Called to process the header line of each file.
     *
     * Only the first header line seen is forwarded to the operators.
     *
     * Returns: true if it was the first header line processed (used when reading
     * multiple files), false otherwise.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a new UniqueKeyValuesLists from the shared field values, or returns
     * null when no operator requested saved field values.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;

        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
836 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * There is no grouping key, so a single Calculator per Operator and a single
 * UniqueKeyValuesLists (if any operator saves field values) cover all input lines.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object, with the operators to be processed.
     *
     * Registers the operators with the base class, then creates one Calculator per
     * newly registered Operator and the value lists shared by those Calculators.
     */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operation, so create them as Operators are added.
         *
         * The 'operators' argument is a class-based input range and has been consumed
         * by super.setOperators, so iterating it again would yield nothing. Walk the
         * stored operator list instead, skipping operators that already received a
         * Calculator in an earlier call.
         */
        foreach (op; _operators[].drop(_calculators.length)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the field delimiter. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result line: one calculated value per operator, joined by
     * the field delimiter.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
886 
/** KeySummarizerBase holds the logic shared by the single-key and multi-key summarizers.
 *
 * The two differ mainly in how the key is formed. They are separate classes to
 * simplify (and speed up) handling of single field keys, the most common use case.
 *
 * Unique keys are remembered in the order first seen. Each key owns its own set of
 * Calculators (one per Operator) and, when needed, its own UniqueKeyValuesLists.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;
    private UniqueKeyData[string] _uniqueKeyData;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Feeds a line to the Calculators and value lists belonging to key, creating
     * the per-key data the first time the key is seen.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData keyData;
        if (auto existing = key in _uniqueKeyData)
        {
            keyData = *existing;
        }
        else
        {
            keyData = addUniqueKey(key.to!string);
        }

        foreach (calculator; keyData.calculators) calculator.processNextLine(lineFields);

        if (keyData.valuesLists !is null)
        {
            keyData.valuesLists.processNextLine(lineFields, _missingPolicy);
        }
    }

    /** Registers a key not seen before: records it in the ordered key list and
     * creates its Calculators (one per Operator) and value lists.
     *
     * Returns: The newly created per-key data.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        Calculator[] keyCalculators;
        keyCalculators.reserve(_numOperators);
        foreach (op; _operators) keyCalculators ~= op.makeCalculator;

        auto keyData = UniqueKeyData(keyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = keyData;
        return keyData;
    }

    /** Writes the header line: the key field header followed by the operator headers. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        immutable delimiter = printOptions.fieldDelimiter;

        put(outputStream, keyFieldHeader());
        put(outputStream, delimiter);

        auto operatorHeaders = _operators[].map!(op => op.header).join(delimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order the keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        immutable delimiter = printOptions.fieldDelimiter;

        foreach (key; _uniqueKeys)
        {
            auto keyData = _uniqueKeyData[key];

            put(outputStream, key);
            put(outputStream, delimiter);

            auto calculatedValues =
                keyData.calculators
                .map!(calculator => calculator.calculate(keyData.valuesLists, printOptions))
                .join(delimiter);
            put(outputStream, calculatedValues);
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}
962 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key is the content of the field at the zero-based index given at construction.
 * Until a header line is processed, the key field header defaults to the name
 * produced by fieldHeaderFromIndex().
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    /** The header written for the key column. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. The first header line seen provides the key field header.
     *
     * Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* The key field is indexed below, so the index must be strictly in range. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the key field's content as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
1001 
/** This Summarizer is for the case where the unique key is based on multiple fields.
 *
 * The key is formed by joining the key fields, in the order given at construction,
 * with the input field delimiter. The key field header is formed the same way, from
 * the header line when one is processed and from fieldHeaderFromIndex() names otherwise.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    /** The header written for the key columns. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. The first header line seen provides the key field header.
     *
     * Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = joinKeyFields(lineFields);
        }
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the joined key fields as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        string key = joinKeyFields(lineFields);
        processNextLineWithKey(key, lineFields);
    }

    /* Joins the key fields of a line, in key order, with the input field delimiter. */
    private string joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
    }
}
1046 
1047 version(unittest)
1048 {
1049     /* testSummarizer is a helper that can run many types of unit tests against
1050      * Summarizers. It can also test operators, but there are separate helper functions
1051      * better suited for that purpose.
1052      *
1053      * Arguments are a command line args, an input file, and expected output. The
1054      * input file and expected output are already split into lines and fields, the helper
1055      * manages re-assembly. The program name from the command line args is printed if an
1056      * an error occurs, it is useful to identify the test that failed.
1057      *
1058      * Note: Much of this is a duplication tsvSummarize logic. Better abstraction of
1059      * file input/output would enable running unit tests directly on top of tsvSummarize.
1060      *
1061      * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
1062      * there needs to be a physical file when call processArgs. Its hard to get around,
1063      * as the intent is to read the header line of the first input file during command
1064      * line argument processing. Eventually this unit test process will need to be
1065      * rewritten. For now, a file with the equivalent data is being added to the command
1066      * line.
1067      *
1068      * Update (Sept 2020): The physical file needs to be closed for unit tests on
1069      * Windows. This is so the temporary file can be deleted without trouble. Since its
1070      * a placeholder in these tests, it's getting iterated but not popped off the
1071      * inputSources and closed. Normal collection is not closing it quick enought. So
1072      * all inputSources are closed at the end of this function.
1073      */
1074     void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
1075     {
1076         import std.array : appender;
1077 
1078         assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");
1079 
1080         auto formatAssertMessage(T...)(string msg, T formatArgs)
1081         {
1082             auto formatString = "[testSummarizer] %s: " ~ msg;
1083             return format(formatString, cmdArgs[0], formatArgs);
1084         }
1085 
1086         TsvSummarizeOptions cmdopt;
1087         auto savedCmdArgs = cmdArgs.to!string;
1088         auto r = cmdopt.processArgs(cmdArgs);
1089         assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));
1090 
1091         assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
1092                formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));
1093 
1094         /* Pick the Summarizer based on the number of key-fields entered. */
1095         auto summarizer =
1096             (cmdopt.keyFields.length == 0)
1097             ? new NoKeySummarizer!(typeof(appender!(char[])()))(
1098                 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
1099 
1100             : (cmdopt.keyFields.length == 1)
1101             ? new OneKeySummarizer!(typeof(appender!(char[])()))(
1102                 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
1103 
1104             : new MultiKeySummarizer!(typeof(appender!(char[])()))(
1105                 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
1106 
1107         /* Add the operators to the Summarizer. */
1108         summarizer.setOperators(inputRangeObject(cmdopt.operators[]));
1109 
1110         /* Process the file one line at a time. */
1111         auto lineFields = new char[][](cmdopt.endFieldIndex);
1112         bool headerFound = false;
1113         foreach (lineNum, line; file.enumerate(1))
1114         {
1115             /* Copy the needed fields to the fields array. */
1116             foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;
1117 
1118             if (cmdopt.hasHeader && lineNum == 1)
1119             {
1120                 if (!headerFound)
1121                 {
1122                     summarizer.processHeaderLine(lineFields);
1123                     headerFound = true;
1124                 }
1125             }
1126             else
1127             {
1128                 try summarizer.processNextLine(lineFields);
1129                 catch (Exception exc)
1130                 {
1131                     assert(false, formatAssertMessage(exc.msg));
1132                 }
1133             }
1134         }
1135         auto printOptions = SummarizerPrintOptions(
1136         cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
1137 
1138         auto summarizerOutput = appender!(char[])();
1139 
1140         if (cmdopt.hasHeader || cmdopt.writeHeader)
1141         {
1142             summarizer.writeSummaryHeader(summarizerOutput, printOptions);
1143         }
1144 
1145         summarizer.writeSummaryBody(summarizerOutput, printOptions);
1146         auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
1147         if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";
1148 
1149         assert(summarizerOutput.data == expectedOutput,
1150                formatAssertMessage(
1151                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
1152                    expectedOutput.to!string, summarizerOutput.data.to!string));
1153 
1154         /* Ensure all files are closed by emptying the stack. */
1155         while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
1156     }
1157 
1158     void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t")
1159     {
1160         import std.algorithm;
1161         import std.stdio;
1162 
1163         auto f = filepath.File("wb");
1164         foreach (record; fileData) f.writeln(record.joiner(delimiter));
1165         f.close;
1166     }
1167 }
1168 
1169 unittest
1170 {
1171     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1172     import std.file : mkdir, rmdirRecurse;
1173     import std.path : buildPath;
1174 
1175     auto testDir = makeUnittestTempDir("tsv_summarizer");
1176     scope(exit) testDir.rmdirRecurse;
1177 
1178     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1179      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1180      * have separate tests, those tests test the no-key summarizer. The Values operator is
1181      * used in these tests. It engages a number of behaviors, and the results have limited
1182      * ambiguity. Using only one operator limits dependence on individual operators.
1183      *
1184      * Update (April 2020): There now needs to be a real file passed to testSummarizer.
1185      * See the comments with testSummarizer for details.
1186      */
1187 
1188     auto file1 = [["fld1", "fld2", "fld3"],
1189                   ["a", "a",  "3"],
1190                   ["c", "a",  "2b"],
1191                   ["c", "bc", ""],
1192                   ["a", "c",  "2b"],
1193                   ["",  "bc", ""],
1194                   ["c", "bc", "3"]];
1195 
1196     auto file1Path = buildPath(testDir, "file1.tsv");
1197     auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv");
1198     writeDataFile(file1Path, file1);
1199     writeDataFile(file1NoHeaderPath, file1[1 .. $]);
1200 
1201     /* Single-key summarizer tests.
1202      */
1203     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path],
1204                    file1,
1205                    [["fld1", "fld1_values"],
1206                     ["a", "a|a"],
1207                     ["c", "c|c|c"],
1208                     ["",  ""]]
1209         );
1210     testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path],
1211                    file1,
1212                    [["fld1", "fld1_values"],
1213                     ["a", "a|a"],
1214                     ["c", "c|c|c"],
1215                     ["",  ""]]
1216         );
1217     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path],
1218                    file1,
1219                    [["fld1", "fld2_values"],
1220                     ["a", "a|c"],
1221                     ["c", "a|bc|bc"],
1222                     ["",  "bc"]]
1223         );
1224     testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path],
1225                    file1,
1226                    [["fld1", "fld2_values"],
1227                     ["a", "a|c"],
1228                     ["c", "a|bc|bc"],
1229                     ["",  "bc"]]
1230         );
1231     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path],
1232                    file1,
1233                    [["fld1", "fld3_values"],
1234                     ["a", "3|2b"],
1235                     ["c", "2b||3"],
1236                     ["",  ""]]
1237         );
1238     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path],
1239                    file1,
1240                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1241                     ["a", "a|a",   "a|c",     "3|2b"],
1242                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1243                     ["",  "",      "bc",      ""]]
1244         );
1245     testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path],
1246                    file1,
1247                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1248                     ["a", "a|a",   "a|c",     "3|2b"],
1249                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1250                     ["",  "",      "bc",      ""]]
1251         );
1252     testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path],
1253                    file1,
1254                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1255                     ["a", "a|a",   "a|c",     "3|2b"],
1256                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1257                     ["",  "",      "bc",      ""]]
1258         );
1259     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path],
1260                    file1,
1261                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1262                     ["a", "a|a",   "a|c",     "3|2b"],
1263                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1264                     ["",  "",      "bc",      ""]]
1265         );
1266     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path],
1267                    file1,
1268                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1269                     ["a", "3|2b",  "a|c",     "a|a"],
1270                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1271                     ["",  "",      "bc",      ""]]
1272         );
1273     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path],
1274                    file1,
1275                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1276                     ["a", "3|2b",  "a|c",     "a|a"],
1277                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1278                     ["",  "",      "bc",      ""]]
1279         );
1280     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path],
1281                    file1,
1282                    [["fld2", "fld1_values"],
1283                     ["a",  "a|c"],
1284                     ["bc", "c||c"],
1285                     ["c",  "a"]]
1286         );
1287     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path],
1288                    file1,
1289                    [["fld2", "fld2_values"],
1290                     ["a",  "a|a"],
1291                     ["bc", "bc|bc|bc"],
1292                     ["c",  "c"]]
1293         );
1294     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path],
1295                    file1,
1296                    [["fld2", "fld3_values"],
1297                     ["a",  "3|2b"],
1298                     ["bc", "||3"],
1299                     ["c",  "2b"]]
1300         );
1301     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path],
1302                    file1,
1303                    [["fld2", "fld1_values", "fld3_values"],
1304                     ["a",  "a|c",  "3|2b"],
1305                     ["bc", "c||c", "||3"],
1306                     ["c",  "a",    "2b"]]
1307         );
1308     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path],
1309                    file1,
1310                    [["fld2", "fld3_values", "fld1_values"],
1311                     ["a",  "3|2b", "a|c"],
1312                     ["bc", "||3",  "c||c"],
1313                     ["c",  "2b",   "a"]]
1314         );
1315     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path],
1316                    file1,
1317                    [["fld3", "fld1_values"],
1318                     ["3",  "a|c"],
1319                     ["2b", "c|a"],
1320                     ["",   "c|"]]
1321         );
1322     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path],
1323                    file1,
1324                    [["fld3", "fld2_values"],
1325                     ["3",  "a|bc"],
1326                     ["2b", "a|c"],
1327                     ["",   "bc|bc"]]
1328         );
1329     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path],
1330                    file1,
1331                    [["fld3", "fld1_values", "fld2_values"],
1332                     ["3",  "a|c", "a|bc"],
1333                     ["2b", "c|a", "a|c"],
1334                     ["",   "c|",  "bc|bc"]]
1335         );
1336     testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path],
1337                    file1,
1338                    [["fld3", "fld1_values", "fld2_values"],
1339                     ["3",  "a|c", "a|bc"],
1340                     ["2b", "c|a", "a|c"],
1341                     ["",   "c|",  "bc|bc"]]
1342         );
1343 
1344     /* Multi-key summarizer tests.
1345      */
1346     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path],
1347                    file1,
1348                    [["fld1", "fld2", "fld1_values"],
1349                     ["a", "a",  "a"],
1350                     ["c", "a",  "c"],
1351                     ["c", "bc", "c|c"],
1352                     ["a", "c",  "a"],
1353                     ["", "bc",  ""]]
1354         );
1355     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path],
1356                    file1,
1357                    [["fld1", "fld2", "fld2_values"],
1358                     ["a", "a",  "a"],
1359                     ["c", "a",  "a"],
1360                     ["c", "bc", "bc|bc"],
1361                     ["a", "c",  "c"],
1362                     ["", "bc",  "bc"]]
1363         );
1364     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path],
1365                    file1,
1366                    [["fld1", "fld2", "fld3_values"],
1367                     ["a", "a",  "3"],
1368                     ["c", "a",  "2b"],
1369                     ["c", "bc", "|3"],
1370                     ["a", "c",  "2b"],
1371                     ["", "bc",  ""]]
1372         );
1373     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path],
1374                    file1,
1375                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1376                     ["a", "a",  "3", "a"],
1377                     ["c", "a",  "2b", "c"],
1378                     ["c", "bc", "|3", "c|c"],
1379                     ["a", "c",  "2b", "a"],
1380                     ["",  "bc", "",   ""]]
1381         );
1382     testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path],
1383                    file1,
1384                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1385                     ["a", "a",  "3", "a"],
1386                     ["c", "a",  "2b", "c"],
1387                     ["c", "bc", "|3", "c|c"],
1388                     ["a", "c",  "2b", "a"],
1389                     ["",  "bc", "",   ""]]
1390         );
1391     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path],
1392                    file1,
1393                    [["fld3", "fld2", "fld1_values"],
1394                     ["3",  "a",  "a"],
1395                     ["2b", "a",  "c"],
1396                     ["",   "bc", "c|"],
1397                     ["2b", "c",  "a"],
1398                     ["3",  "bc", "c"]]
1399         );
1400     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path],
1401                    file1,
1402                    [["fld3", "fld2", "fld1_values"],
1403                     ["3",  "a",  "a"],
1404                     ["2b", "a",  "c"],
1405                     ["",   "bc", "c|"],
1406                     ["2b", "c",  "a"],
1407                     ["3",  "bc", "c"]]
1408         );
1409     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path],
1410                    file1,
1411                    [["fld2", "fld1", "fld3", "fld2_values"],
1412                     ["a",  "a", "3",  "a"],
1413                     ["a",  "c", "2b", "a"],
1414                     ["bc", "c", "",   "bc"],
1415                     ["c",  "a", "2b", "c"],
1416                     ["bc", "",  "",   "bc"],
1417                     ["bc", "c", "3",  "bc"]]
1418         );
1419 
1420     /* Missing policies. */
1421     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path],
1422                    file1,
1423                    [["fld1", "fld1_values"],
1424                     ["a", "a|a"],
1425                     ["c", "c|c|c"],
1426                     ["",  ""]]
1427         );
1428     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path],
1429                    file1,
1430                    [["fld1", "fld2_values"],
1431                     ["a", "a|c"],
1432                     ["c", "a|bc|bc"],
1433                     ["",  "bc"]]
1434         );
1435     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path],
1436                    file1,
1437                    [["fld1", "fld3_values"],
1438                     ["a", "3|2b"],
1439                     ["c", "2b|3"],
1440                     ["",  ""]]
1441         );
1442     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path],
1443                    file1,
1444                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1445                     ["a", "a|a",   "a|c",     "3|2b"],
1446                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1447                     ["",  "",      "bc",      ""]]
1448         );
1449     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path],
1450                    file1,
1451                    [["fld1", "fld1_values"],
1452                     ["a", "a|a"],
1453                     ["c", "c|c|c"],
1454                     ["",  "NA"]]
1455         );
1456     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path],
1457                    file1,
1458                    [["fld1", "fld2_values"],
1459                     ["a", "a|c"],
1460                     ["c", "a|bc|bc"],
1461                     ["",  "bc"]]
1462         );
1463     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path],
1464                    file1,
1465                    [["fld1", "fld3_values"],
1466                     ["a", "3|2b"],
1467                     ["c", "2b|NA|3"],
1468                     ["",  "NA"]]
1469         );
1470     testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", "--values", "fld3", "-r", "NA", file1Path],
1471                    file1,
1472                    [["fld1", "fld3_values"],
1473                     ["a", "3|2b"],
1474                     ["c", "2b|NA|3"],
1475                     ["",  "NA"]]
1476         );
1477     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path],
1478                    file1,
1479                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1480                     ["a", "a|a",   "a|c",     "3|2b"],
1481                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1482                     ["",  "NA",      "bc",      "NA"]]
1483         );
1484     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path],
1485                    file1,
1486                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1487                     ["a", "a",  "3", "a"],
1488                     ["c", "a",  "2b", "c"],
1489                     ["c", "bc", "3", "c|c"],
1490                     ["a", "c",  "2b", "a"],
1491                     ["",  "bc", "",   ""]]
1492         );
1493     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path],
1494                    file1,
1495                    [["fld3", "fld2", "fld1_values"],
1496                     ["3",  "a",  "a"],
1497                     ["2b", "a",  "c"],
1498                     ["",   "bc", "c"],
1499                     ["2b", "c",  "a"],
1500                     ["3",  "bc", "c"]]
1501         );
1502     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path],
1503                    file1,
1504                    [["fld2", "fld1", "fld3", "fld2_values"],
1505                     ["a",  "a", "3",  "a"],
1506                     ["a",  "c", "2b", "a"],
1507                     ["bc", "c", "",   "bc"],
1508                     ["c",  "a", "2b", "c"],
1509                     ["bc", "",  "",   "bc"],
1510                     ["bc", "c", "3",  "bc"]]
1511         );
1512     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path],
1513                    file1,
1514                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1515                     ["a", "a",  "3", "a"],
1516                     ["c", "a",  "2b", "c"],
1517                     ["c", "bc", "NA|3", "c|c"],
1518                     ["a", "c",  "2b", "a"],
1519                     ["",  "bc", "NA",   "NA"]]
1520         );
1521     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path],
1522                    file1,
1523                    [["fld3", "fld2", "fld1_values"],
1524                     ["3",  "a",  "a"],
1525                     ["2b", "a",  "c"],
1526                     ["",   "bc", "c|NA"],
1527                     ["2b", "c",  "a"],
1528                     ["3",  "bc", "c"]]
1529         );
1530     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path],
1531                    file1,
1532                    [["fld2", "fld1", "fld3", "fld2_values"],
1533                     ["a",  "a", "3",  "a"],
1534                     ["a",  "c", "2b", "a"],
1535                     ["bc", "c", "",   "bc"],
1536                     ["c",  "a", "2b", "c"],
1537                     ["bc", "",  "",   "bc"],
1538                     ["bc", "c", "3",  "bc"]]
1539         );
1540 
1541     /* Validate that the no-key summarizer works with testSummarizer helper function.
1542      */
1543     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path],
1544                    file1,
1545                    [["fld1_values", "fld2_values"],
1546                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1547         );
1548     testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path],
1549                    file1,
1550                    [["fld1_values", "fld2_values"],
1551                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1552         );
1553 
1554     /* Header variations: no header line; auto-generated header line; custom headers.
1555      */
1556     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath],
1557                    file1[1..$],
1558                    [["a", "a|a"],
1559                     ["c", "c|c|c"],
1560                     ["",  ""]]
1561         );
1562     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath],
1563                    file1[1..$],
1564                    [["a", "a",  "a"],
1565                     ["c", "a",  "a"],
1566                     ["c", "bc", "bc|bc"],
1567                     ["a", "c",  "c"],
1568                     ["", "bc",  "bc"]]
1569         );
1570     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath],
1571                    file1[1..$],
1572                    [["field2", "field1_values"],
1573                     ["a",  "a|c"],
1574                     ["bc", "c||c"],
1575                     ["c",  "a"]]
1576         );
1577     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath],
1578                    file1[1..$],
1579                    [["field3", "field2", "field1_values"],
1580                     ["3",  "a",  "a"],
1581                     ["2b", "a",  "c"],
1582                     ["",   "bc", "c|"],
1583                     ["2b", "c",  "a"],
1584                     ["3",  "bc", "c"]]
1585         );
1586     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path],
1587                    file1,
1588                    [["fld2", "Field3Values"],
1589                     ["a",  "3|2b"],
1590                     ["bc", "||3"],
1591                     ["c",  "2b"]]
1592         );
1593     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path],
1594                    file1,
1595                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1596                     ["a", "a",  "3", "a"],
1597                     ["c", "a",  "2b", "c"],
1598                     ["c", "bc", "|3", "c|c"],
1599                     ["a", "c",  "2b", "a"],
1600                     ["",  "bc", "",   ""]]
1601         );
1602     testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path],
1603                    file1,
1604                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1605                     ["a", "a",  "3", "a"],
1606                     ["c", "a",  "2b", "c"],
1607                     ["c", "bc", "|3", "c|c"],
1608                     ["a", "c",  "2b", "a"],
1609                     ["",  "bc", "",   ""]]
1610         );
1611     testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path],
1612                    file1,
1613                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1614                     ["a", "a",  "3", "a"],
1615                     ["c", "a",  "2b", "c"],
1616                     ["c", "bc", "|3", "c|c"],
1617                     ["a", "c",  "2b", "a"],
1618                     ["",  "bc", "",   ""]]
1619         );
1620     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath],
1621                    file1[1..$],
1622                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1623                     ["a", "3|2b",  "a|c",     "a|a"],
1624                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1625                     ["",  "",      "bc",      ""]]
1626         );
1627     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1628                    file1[1..$],
1629                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1630                     ["a", "3",  "a",  "3",  "a", "a"],
1631                     ["c", "2b", "a",  "2b", "c", "a"],
1632                     ["c", "",   "bc", "",   "c", "bc"],
1633                     ["a", "2b", "c",  "2b", "a", "c"],
1634                     ["",  "",   "bc", "",   "",  "bc"],
1635                     ["c", "3",  "bc", "3",  "c", "bc"]]
1636         );
1637     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1638                    file1[1..$],
1639                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1640                     ["a", "3",  "a",  "3",  "a", "a"],
1641                     ["c", "2b", "a",  "2b", "c", "a"],
1642                     ["c", "",   "bc", "",   "c", "bc"],
1643                     ["a", "2b", "c",  "2b", "a", "c"],
1644                     ["",  "",   "bc", "",   "",  "bc"],
1645                     ["c", "3",  "bc", "3",  "c", "bc"]]
1646         );
1647 
1648     /* Alternate file widths and lengths.
1649      */
1650 
1651     auto file3x2 = [["fld1", "fld2", "fld3"],
1652                     ["a", "b", "c"],
1653                     ["c", "b", "a"]];
1654 
1655     auto file3x2Path = buildPath(testDir, "file3x2.tsv");
1656     auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv");
1657     writeDataFile(file3x2Path, file3x2);
1658     writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]);
1659 
1660     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path],
1661                    file3x2,
1662                    [["fld1", "fld3_values"],
1663                     ["a", "c"],
1664                     ["c", "a"]]
1665         );
1666     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path],
1667                    file3x2,
1668                    [["fld2", "fld3_values"],
1669                     ["b", "c|a"]]
1670         );
1671     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path],
1672                    file3x2,
1673                    [["fld2", "fld1", "fld3_values"],
1674                     ["b", "a", "c"],
1675                     ["b", "c", "a"]]
1676         );
1677 
1678     auto file3x1 = [["fld1", "fld2", "fld3"],
1679                     ["a", "b", "c"]];
1680 
1681     auto file3x1Path = buildPath(testDir, "file3x1.tsv");
1682     auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv");
1683     writeDataFile(file3x1Path, file3x1);
1684     writeDataFile(file3x1NoHeaderPath, file3x1[1 .. $]);
1685 
1686     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path],
1687                    file3x1,
1688                    [["fld1", "fld3_values"],
1689                     ["a", "c"]]
1690         );
1691     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath],
1692                    file3x1[1..$],
1693                    [["a", "c"]]
1694         );
1695     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path],
1696                    file3x1,
1697                    [["fld2", "fld1", "fld3_values"],
1698                     ["b", "a", "c"]]
1699         );
1700     testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path],
1701                    file3x1,
1702                    [["fld2", "fld1", "fld3_values"],
1703                     ["b", "a", "c"]]
1704         );
1705     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath],
1706                    file3x1[1..$],
1707                    [["b", "a", "c"]]
1708         );
1709 
1710     auto file3x0 = [["fld1", "fld2", "fld3"]];
1711 
1712     auto file3x0Path = buildPath(testDir, "file3x0.tsv");
1713     auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv");
1714     writeDataFile(file3x0Path, file3x0);
1715     writeDataFile(file3x0NoHeaderPath, file3x0[1 .. $]);
1716 
1717 
1718     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path],
1719                    file3x0,
1720                    [["fld1", "fld3_values"]]
1721         );
1722     testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path],
1723                    file3x0,
1724                    [["fld1", "fld3_values"]]
1725         );
1726     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1727                    file3x0[1..$],
1728                    []
1729         );
1730     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1731                    file3x0[1..$],
1732                    [["field1", "field3_values"]]
1733         );
1734 
1735 
1736     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path],
1737                    file3x0,
1738                    [["fld2", "fld1", "fld3_values"]]
1739         );
1740 
1741     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1742                    file3x0[1..$],
1743                    []
1744         );
1745 
1746     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1747                    file3x0[1..$],
1748                    [["field2", "field1", "field3_values"]]
1749         );
1750 
1751     auto file2x1 = [["fld1", "fld2"],
1752                     ["a", "b"]];
1753 
1754     auto file2x1Path = buildPath(testDir, "file2x1.tsv");
1755     auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv");
1756     writeDataFile(file2x1Path, file2x1);
1757     writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]);
1758 
1759     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path],
1760                    file2x1,
1761                    [["fld1", "fld2_values"],
1762                     ["a", "b"]]
1763         );
1764     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path],
1765                    file2x1,
1766                    [["fld2", "fld1", "fld1_values"],
1767                     ["b", "a", "a"]]
1768         );
1769 
1770     auto file2x0 = [["fld1", "fld2"]];
1771 
1772     auto file2x0Path = buildPath(testDir, "file2x0.tsv");
1773     auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv");
1774     writeDataFile(file2x0Path, file2x0);
1775     writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]);
1776 
1777     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path],
1778                    file2x0,
1779                    [["fld1", "fld2_values"]]
1780         );
1781     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path],
1782                    file2x0,
1783                    [["fld2", "fld1", "fld1_values"]]
1784         );
1785 
1786     auto file1x2 = [["fld1"],
1787                     ["a"],
1788                     [""]];
1789 
1790     auto file1x2Path = buildPath(testDir, "file1x2.tsv");
1791     auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv");
1792     writeDataFile(file1x2Path, file1x2);
1793     writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]);
1794 
1795     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path],
1796                    file1x2,
1797                    [["fld1", "fld1_values"],
1798                     ["a", "a"],
1799                     ["",  ""]]
1800         );
1801 
1802     auto file1x2b = [["fld1"],
1803                      [""],
1804                      [""]];
1805 
1806     auto file1x2bPath = buildPath(testDir, "file1x2b.tsv");
1807     auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv");
1808     writeDataFile(file1x2bPath, file1x2b);
1809     writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]);
1810 
1811     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath],
1812                    file1x2b,
1813                    [["fld1", "fld1_values"],
1814                     ["", "|"]]
1815         );
1816 
1817     auto file1x1 = [["fld1"],
1818                     ["x"]];
1819 
1820     auto file1x1Path = buildPath(testDir, "file1x1.tsv");
1821     auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv");
1822     writeDataFile(file1x1Path, file1x1);
1823     writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]);
1824 
1825     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path],
1826                    file1x1,
1827                    [["fld1", "fld1_values"],
1828                     ["x", "x"]]
1829         );
1830     testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path],
1831                    file1x1,
1832                    [["fld1", "fld1_values"],
1833                     ["x", "x"]]
1834         );
1835 
1836     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1837                    file1x1[1..$],
1838                    [["x", "x"]]
1839         );
1840 
1841     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1842                    file1x1[1..$],
1843                    [["field1", "field1_values"],
1844                     ["x", "x"]]
1845         );
1846 
1847     auto file1x1b = [["fld1"],
1848                     [""]];
1849 
1850     auto file1x1bPath = buildPath(testDir, "file1x1b.tsv");
1851     auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv");
1852     writeDataFile(file1x1bPath, file1x1b);
1853     writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]);
1854 
1855     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath],
1856                    file1x1b,
1857                    [["fld1", "fld1_values"],
1858                     ["", ""]]
1859         );
1860 
1861     auto file1x0 = [["fld1"]];
1862 
1863     auto file1x0Path = buildPath(testDir, "file1x0.tsv");
1864     auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv");
1865     writeDataFile(file1x0Path, file1x0);
1866     writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]);
1867 
1868     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path],
1869                    file1x0,
1870                    [["fld1", "fld1_values"]]
1871         );
1872 
1873     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1874                    file1x0[1..$],
1875                    []
1876         );
1877 
1878     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1879                    file1x0[1..$],
1880                    [["field1", "field1_values"]]
1881         );
1882 
1883     /* Alternate delimiters.
1884      *
1885      * Note: In current unit test setup the data is already in memory (file1).
1886      * 'file1Path' points to a file with equivalent data, but not read, except if
1887      * processing the header line. A data file is created for the '%' and '#'
1888      * delimiter cases (these read the header), but we don't bother for the others.
1889      */
1890     auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv");
1891     auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv");
1892     writeDataFile(file1PctDelimPath, file1, "%");
1893     writeDataFile(file1HashDelimPath, file1, "#");
1894 
1895     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath],
1896                    file1,
1897                    [["fld1_values", "fld2_values"],
1898                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1899         );
1900     testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath],
1901                    file1,
1902                    [["fld1_values", "fld2_values"],
1903                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1904         );
1905     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path],
1906                    file1,
1907                    [["fld1_values", "fld2_values"],
1908                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1909         );
1910     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
1911                    file1,
1912                    [["fld1_values", "fld2_values"],
1913                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1914         );
1915     testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
1916                    file1,
1917                    [["fld1_values", "fld2_values"],
1918                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1919         );
1920     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1921                     "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath],
1922                    file1[1..$],
1923                    [["field2", "field1_values"],
1924                     ["a",  "a:c"],
1925                     ["bc", "c::c"],
1926                     ["c",  "a"]]
1927         );
1928     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1929                     "--values-delimiter", "\\", file1NoHeaderPath],
1930                    file1[1..$],
1931                    [["a", "a",  "a"],
1932                     ["c", "a",  "a"],
1933                     ["c", "bc", "bc\\bc"],
1934                     ["a", "c",  "c"],
1935                     ["", "bc",  "bc"]]
1936         );
1937 }
1938 
1939 /* Summary Operators and Calculators
1940  *
1941  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1942  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1943  * Calculator is used to manage the summary calculation for each unique key in the input.
1944  *
1945  * As an example, consider the command:
1946  *
1947  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1948  *
1949  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1950  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1951  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1952  * calculator needs to track occurrence count and sum. Calculators produce the final
1953  * value when all processing is finished.
1954  *
1955  * Summary field headers
1956  *
1957  * There are several options for specifying summary field headers. The defaults combine the
1958  * operator name and the header of the field summarized. The defaults can be overridden on
1959  * the command line. These scenarios are supported via the operator constructor and the
1960  * processHeaderLine() method.
1961  *
1962  * Missing field policy
1963  *
1964  * At present, tsv-summarize has a single policy for handling missing values that applies
1965  * to all operators. However, it is logically operator specific and is implemented that
1966  * way. The MissingFieldPolicy class describes the policy; each operator contains one.
1967  * Calculators access their operator's policy object.
1968  */
1969 
/** Operator: the command-line side of one summary calculation (e.g. '--mean 5').
 *
 *  An Operator exposes 'header' and 'name' string properties, is handed the fields
 *  of the input header line, advertises the field indices it needs saved (as numeric
 *  and as text values), and creates the Calculator objects that perform the
 *  calculation.
 */
interface Operator
{
    @property
    {
        string header();
        string name();
    }

    /* Receives the fields of the input header line. */
    void processHeaderLine(const(char[][]) fields);

    /* Field indices this Operator needs saved: numeric values, then text values. */
    size_t[] numericFieldsToSave();
    size_t[] textFieldsToSave();

    /* Creates a Calculator for this Operator. */
    Calculator makeCalculator();
}
1982 
/** Calculator: carries out a single computation.
 *
 *  Every input line is handed to processNextLine(). Once all processing is
 *  finished, calculate() produces the final value as a string; it is given the
 *  saved value lists and the print options.
 */
interface Calculator
{
    void processNextLine(const(char[][]) fields);

    string calculate(UniqueKeyValuesLists valuesLists,
                     ref const(SummarizerPrintOptions) printOptions);
}
1991 
/** Describes what to do when a missing (empty) field value is encountered.
 *
 *  The accessors report one of three outcomes:
 *   - useMissing:     missing values are processed unchanged (the default).
 *   - replaceMissing: missing values are replaced by missingReplacement. Selected
 *                     whenever a non-empty replacement string is supplied, even if
 *                     excludeMissing was also requested.
 *   - excludeMissing: neither of the above; exclusion was requested and no
 *                     replacement string was supplied.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Process missing values as-is.
    private bool _replaceMissing = false;     // Substitute _missingReplacement for missing values.
    private string _missingReplacement;       // Substitute text; meaningful when _replaceMissing is set.

    /** Constructs the policy; the arguments are forwarded to updatePolicy(). */
    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /** Resets all three policy fields from the arguments.
     *
     *  A non-empty missingReplacement turns replacement on. Missing values are
     *  used unchanged only when neither exclusion nor replacement is in effect.
     */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        immutable bool hasReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = hasReplacement;
        _useMissing = !(excludeMissing || hasReplacement);
    }

    /** A field counts as missing when it has zero length. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    /** True if missing values are processed unchanged. */
    @property final bool useMissing() const
    {
        return _useMissing;
    }

    /** True if missing values are neither used unchanged nor replaced. */
    @property final bool excludeMissing() const
    {
        return !(_useMissing || _replaceMissing);
    }

    /** True if missing values are replaced by missingReplacement. */
    @property final bool replaceMissing() const
    {
        return _replaceMissing;
    }

    /** The replacement string last passed to the constructor or updatePolicy(). */
    @property final string missingReplacement() const
    {
        return _missingReplacement;
    }
}
2037 
2038 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
2039  * while reading data. Operations like median collect all values and operate on them when
2040  * running the final calculation. Value lists are needed for each unique key. A command
2041  * using multiple Operators may save multiple fields. And, different Operators may be run
2042  * against the same field.
2043  *
2044  * The last part motivates these classes. Handling large data sets necessitates minimizing
2045  * in-memory storage, making it desirable to share identical lists between Calculators.
2046  * Otherwise, each Calculator could implement its own storage, which would be simpler.
2047  *
2048  * The setup works as follows:
2049  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
2050  *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
2051  *    of the fields advertised by Operators as needing sharing. This list gets created
2052  *    during command initialization (SummarizerBase.setOperators).
2053  *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
2054  *    time a new unique key is found, in parallel to the Calculator objects created for the
2055  *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
2056  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
2057  *    Calculators, saving the values.
2058  *  - Calculators retrieve the saved values during the calculation phase. The calculator's
2059  *    processNextField method is typically a no-op.
2060  *  - Calculators cannot make assumptions about the order of the saved values. This is a
2061  *    pragmatic concession to median and quantile calculations, which need to sort the data,
2062  *    at least partially. Rather than generate sorted copies, the current algorithms
2063  *    sort the data in place.
2064  *
2065  * One concession to duplicate storage is that text and numeric versions of the same
2066  * field might be stored. The reason is because it's important to convert text to numbers
2067  * as they are read so that useful error messages can be generated. And, storing both
2068  * forms of the same field should be less common.
2069  *
2070  * The current implementation uses the same missing values policy for all fields. If
2071  * multiple policies become supported this will need to change.
2072  *
2073  * Built-in calculations - UniqueKeyValuesLists has a built-in median operation. This is
2074  * to avoid repeated calculations of the median by different calculations.
2075  */
2076 
final class SharedFieldValues
{
    // Indices of the fields whose values must be saved, in the order first requested.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Appends index to indices unless it is already present, so that every field
     * index is listed at most once.
     */
    private static void appendIfAbsent(ref size_t[] indices, size_t index)
    {
        if (indices.canFind(index)) return;
        indices ~= index;
    }

    /* Registers a field whose numeric values need to be saved. Called during summarizer
     * setup, eg. '--median 7' ends up calling addNumericIndex(6), 6 being the zero-based
     * index. Registering an index a second time has no effect.
     */
    final void addNumericIndex (size_t index)
    {
        appendIfAbsent(_numericFieldIndices, index);
    }

    /* Text counterpart of addNumericIndex. */
    final void addTextIndex (size_t index)
    {
        appendIfAbsent(_textFieldIndices, index);
    }

    /* Creates the value lists for one unique key, covering every field index registered
     * so far. Called each time a new key is found, or once at the beginning of the
     * program if no keys are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto valuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return valuesLists;
    }
}
2106 
final class UniqueKeyValuesLists
{
    /* Each FieldValues object is the list of values collected for one field. A unique
     * key may need several of them. For example, a command computing the medians of
     * fields 4 and 5 ('--median 4 --median 5') keeps a list for each of the two fields,
     * making _numericFieldValues a two element array: one entry with the field 4
     * values, the other with the field 5 values. A specific field is located by linear
     * search.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;   // Not referenced by the code in this class.

    /* Builds one (initially empty) value list per requested field index. Either index
     * array may be empty, in which case the corresponding list array stays empty.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        _numericFieldValues.reserve(numericFieldIndices.length);
        foreach (fieldIndex; numericFieldIndices)
            _numericFieldValues ~= new FieldValues!double(fieldIndex);

        _textFieldValues.reserve(textFieldIndices.length);
        foreach (fieldIndex; textFieldIndices)
            _textFieldValues ~= new FieldValues!string(fieldIndex);
    }

    /* Passes an input line to every value list, numeric lists first, then text lists.
     * Each list picks out and saves the value of its own field.
     */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (numericList; _numericFieldValues)
            numericList.processNextLine(fields, missingPolicy);

        foreach (textList; _textFieldValues)
            textList.processNextLine(fields, missingPolicy);
    }

    /* Finds the numeric value list for a field index. The index must be one of those
     * given to the constructor.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias hasIndex = (FieldValues!double candidate, size_t wanted) => candidate.fieldIndex == wanted;
        auto matches = _numericFieldValues.find!hasIndex(index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Finds the text value list for a field index. The index must be one of those
     * given to the constructor.
     */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias hasIndex = (FieldValues!string candidate, size_t wanted) => candidate.fieldIndex == wanted;
        auto matches = _textFieldValues.find!hasIndex(index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Saved numeric values of a field, in their current storage order. */
    final double[] numericValues(size_t index)
    {
        auto fieldValues = findNumericFieldValues(index);
        return fieldValues.getArray;
    }

    /* Saved numeric values of a field, sorted. The stored data is sorted in place. */
    final double[] numericValuesSorted(size_t index)
    {
        auto fieldValues = findNumericFieldValues(index);
        return fieldValues.getSortedArray;
    }

    /* Saved text values of a field, in their current storage order. */
    final string[] textValues(size_t index)
    {
        auto fieldValues = findTextFieldValues(index);
        return fieldValues.getArray;
    }

    /* Saved text values of a field, sorted. The stored data is sorted in place. */
    final string[] textValuesSorted(size_t index)
    {
        auto fieldValues = findTextFieldValues(index);
        return fieldValues.getSortedArray;
    }

    /* Median of the saved numeric values of a field. The result is cached by the
     * underlying value list until another value is added.
     */
    final double numericValuesMedian(size_t index)
    {
        auto fieldValues = findNumericFieldValues(index);
        return fieldValues.median;
    }

    /* The values saved for a single field, plus cached state (sorted flag, median)
     * that is invalidated whenever a value is added.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        /* Number of values saved so far. */
        final size_t length() const @property
        {
            return _values.data.length;
        }

        /* Zero-based index of the field this list collects. */
        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Stores a value and drops the cached median and sorted state. */
        private final void appendValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Saves this list's field from an input line, converted to ValueType. A missing
         * field is saved as-is, replaced, or skipped, as directed by the missing policy.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];

            if (!missingPolicy.useMissing && missingPolicy.isMissingField(field))
            {
                /* A missing field under an exclude or replace policy. */
                if (!missingPolicy.replaceMissing) return;
                appendValue(missingPolicy.missingReplacement.to!ValueType);
            }
            else
            {
                appendValue(field.to!ValueType);
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The saved values, in their current storage order. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The saved values in sorted order. The stored data itself is sorted, and only
         * when it is not already flagged as sorted.
         */
        final ValueType[] getSortedArray()
        {
            import std.algorithm : sort;

            auto data = _values.data;
            if (!_isSorted)
            {
                data.sort();
                _isSorted = true;
            }
            return data;
        }

        /* The median of the saved values. Calculated on the first request after a
         * value was added, then served from the cache.
         *
         * NOTE(review): rangeMedian operates directly on the stored data. If it
         * reorders elements, _isSorted could be stale afterwards - confirm.
         */
        final ValueType median()
        {
            import tsv_utils.common.numerics : rangeMedian;

            if (_haveMedian) return _medianValue;

            _medianValue = _values.data.rangeMedian();
            _haveMedian = true;
            return _medianValue;
        }
    }
}
2263 
/** SingleFieldOperator is the base class for operators working on a single field, the
 * most common kind of Operator. Derived classes implement makeCalculator and the
 * Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Sets up the operator and its default output header. The default header is built
     * from the field index, with the operator name as suffix when useHeaderSuffix is
     * set. It may be overridden later by a custom header or by a header line.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Replaces the output header with a caller supplied one. Once set, header lines no
     * longer change the header. Only valid for operators allowing custom headers.
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    /* The operator name given at construction. */
    final string name() const @property
    {
        return _name;
    }

    /* Whether setCustomHeader may be used with this operator. */
    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of the operator's field should be saved. They should be called during
     * construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    /* The missing field policy given at construction. */
    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    /* Zero-based index of the field the operator works on. */
    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* The current output header: the default, a header line based one, or a custom one. */
    final string header() const @property
    {
        return _header;
    }

    /* Whether the operator name is appended to default and header line based headers. */
    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    /* Derives the output header from the operator's field in the input header line.
     * Does nothing when a custom header has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    /* Indices of the fields whose numeric values this operator needs saved. */
    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    /* Indices of the fields whose text values this operator needs saved. */
    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}
2367 
/** SingleFieldCalculator is the base class for the common case of a calculator using a
 * single field. Derived classes implement processNextField() rather than
 * processNextLine().
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    /* Zero-based index of the field the calculator works on. */
    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Picks the calculator's field out of the line and applies the operator's missing
     * field policy: the field is passed on as-is, a missing field is replaced by the
     * configured replacement, or a missing field is skipped.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] field = fields[_fieldIndex];

        immutable bool handleAsMissing = !policy.useMissing && policy.isMissingField(field);

        if (!handleAsMissing)
        {
            processNextField(field);
        }
        else if (policy.replaceMissing)
        {
            processNextField(policy.missingReplacement);
        }
    }

    abstract SingleFieldOperator getOperator();

    abstract void processNextField(const char[] field);
}
2406 
2407 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2408 version(unittest)
2409 {
2410     /** A helper for SingleFieldOperator unit tests.
2411      *
2412      * testSingleFieldOperator takes a set of split file values, a field index, a header
2413      * suffix, and a set of expected values. The expected values array contains the
2414      * initial value (zero entries) and the expected values after each line. (One more
2415      * expected value than input lines.) The zero entry case is what is generated for an
2416      * empty file. An example testing the 'min' operator against a file with 2 columns,
2417      * 3 rows, using field index 1:
2418      *
2419      *    testSingleFieldOperator!MinOperator(
2420      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2421      *        ["5", "50"],
2422      *        ["20", "200"]],
2423      *       1,                            // Field index (zero-based, so "100", "50", "200")
2424      *       "min",                        // The header suffix, normally the operator name.
2425      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2426      *
2427      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2428      * Then run the operator is tested against each column, a total of six calls. Headers
2429      * are automatically checked. Additional entries can be used to extend coverage.
2430      *
2431      * A non-default MissingFieldPolicy can be provide as an optional last argument.
2432      * Operator tests should include exclusion and replacement variations. See operator
2433      * unit tests for details.
2434      *
2435      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2436      * init arguments. Currently this is used only by the quantile operator.
2437      *
2438      * These tests do not check unique key behavior (group-by). Operators don't have info
2439      * about unique keys, and interact with them only indirectly, via Calculators.
2440      */
2441     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2442         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2443          const char[][] expectedValues,
2444          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2445     {
2446         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2447     }
2448 
2449     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2450         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2451          const char[][] expectedValues,
2452          MissingFieldPolicy missingPolicy,
2453          T extraOpInitArgs)
2454     {
2455         import std.format : format;
2456         import std.array : appender;
2457         import std..string : chomp;
2458         import std.traits : EnumMembers;
2459 
2460         auto numFields = (splitFile[0]).length;
2461 
2462         assert(fieldIndex < numFields,
2463                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2464                       headerSuffix));
2465         assert(splitFile.length + 1 == expectedValues.length,
2466                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2467                       headerSuffix));
2468 
2469         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2470         auto printOptions = SummarizerPrintOptions('#', '|');
2471 
2472         /* An input header line. */
2473         string[] inputHeaderLine = new string[numFields];
2474         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2475 
2476         /* The different expected output field headers. */
2477         auto outputFieldHeaderWithNoHeaderLine =
2478             fieldHeaderFromIndex(fieldIndex)
2479             .summaryHeaderFromFieldHeader(headerSuffix);
2480         auto outputFieldHeaderFromHeaderLine =
2481             inputHeaderLine[fieldIndex]
2482             .summaryHeaderFromFieldHeader(headerSuffix);
2483         auto customOutputFieldHeader = "custom";
2484 
2485         enum HeaderUsecase {
2486             HeaderLine_DefaultHeader,
2487             HeaderLine_CustomHeader,
2488             NoHeaderLine_DefaultHeader,
2489             NoHeaderLine_CustomHeader,
2490             NoHeaderLine_NoOutputHeader,
2491         }
2492 
2493         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2494         {
2495             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2496                           op.name, hc, actual, expected);
2497         }
2498 
2499         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2500                                   const char[] actual, const char[] expected)
2501         {
2502             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2503                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2504         }
2505 
2506         /* Run the logic for each header use case. */
2507         foreach (hc; EnumMembers!HeaderUsecase)
2508         {
2509             bool hasInputHeader = (
2510                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2511                 hc == HeaderUsecase.HeaderLine_CustomHeader
2512                 );
2513             bool hasOutputHeader = (
2514                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2515                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2516                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2517                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2518                 );
2519             bool hasCustomHeader = (
2520                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2521                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2522                 );
2523 
2524             if (hasCustomHeader) assert(hasOutputHeader);
2525 
2526             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2527 
2528             if (hasCustomHeader)
2529             {
2530                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2531                 op.setCustomHeader(customOutputFieldHeader);
2532             }
2533 
2534             Operator[] operatorArray;
2535             operatorArray ~= op;
2536 
2537             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2538             summarizer.setOperators(inputRangeObject(operatorArray));
2539 
2540             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2541 
2542             if (hasOutputHeader)
2543             {
2544                 /* Write the header line. Note that this is a one-field header, */
2545                 auto headerLineOutput = appender!(char[])();
2546                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2547 
2548                 /* Test that the header was generated correctly.
2549                  *
2550                  * Note: Because the output is generated by a Summarizer, it will have a
2551                  * trailing newline. Use chomp to trim it.
2552                  */
2553                 final switch (hc)
2554                 {
2555                 case HeaderUsecase.HeaderLine_DefaultHeader:
2556                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2557                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2558                                                outputFieldHeaderFromHeaderLine));
2559                     break;
2560                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2561                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2562                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2563                                                outputFieldHeaderWithNoHeaderLine));
2564                     break;
2565                 case HeaderUsecase.HeaderLine_CustomHeader:
2566                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2567                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2568                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2569                                                customOutputFieldHeader));
2570                     break;
2571                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2572                     break;
2573                }
2574 
2575             }
2576 
2577             /* For each line, process the line, generate the output, and test that the
2578              * value is correct. Start with the empty file case.
2579              */
2580             foreach (i, const char[] expected; expectedValues)
2581             {
2582                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2583                 auto summaryLineOutput = appender!(char[])();
2584                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2585                 assert(summaryLineOutput.data.chomp == expected,
2586                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2587                                           summaryLineOutput.data.chomp, expectedValues[i]));
2588             }
2589         }
2590     }
2591 }
2592 
/** ZeroFieldOperator is the base class for operators that take no field input. The
 * main use case is the CountOperator, which counts the occurrences of each unique key.
 * Other uses are possible, for example, weighted random number assignment.
 *
 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to make clear
 * what information is available to such a routine. In particular, the split fields
 * passed to processHeaderLine and processNextLine don't include all fields in the
 * input, something that might not be obvious when implementing an operator. (Only
 * fields required by operators acting on specific fields are included.)
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /* The operator name doubles as the default output header. */
    this(string operatorName)
    {
        _header = operatorName;
        _name = operatorName;
    }

    /* Replaces the default output header. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* Custom headers are allowed by default. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /* The operator name given at construction. */
    final string name() const @property
    {
        return _name;
    }

    /* The current output header: the operator name unless a custom header was set. */
    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields)
    {
    }

    /* Always an empty list. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Always an empty list. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    abstract ZeroFieldCalculator makeCalculator();
}
2655 
/** ZeroFieldCalculator is the base class for calculators that don't use fields as
 * input, the Count operator's calculator in particular. It is the companion of the
 * ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() rather than processNextLine(), and
 * the single argument form of calculate(), both declared abstract here.
 */
class ZeroFieldCalculator : Calculator
{
    this()
    {
    }

    /* Called once per input line. The fields are not examined; each line is reported
     * to the derived class as one more entry.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        processNextEntry();
    }

    /* Two argument form of calculate(). The saved value lists are ignored and the
     * call is forwarded to the single argument form.
     */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return this.calculate(printOptions);
    }

    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}
2680 
2681 version(unittest)
2682 {
2683     /* A helper for ZeroFieldOperator unit tests.
2684      *
2685      * testZeroFieldOperator takes a set of split file values, a default header, and a
2686      * set of expected values. The expected values array contains the expected values
2687      * after each line.
2688      *
2689      * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
2690      * there is no use of field indices and fewer types of headers. See the latter's
2691      * documentation and the CountOperator unit tests for examples.
2692      */
2693     void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
2694         (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
2695     {
2696         import std.format : format;
2697         import std.array : appender;
2698         import std..string : chomp;
2699         import std.traits : EnumMembers;
2700 
2701         auto numFields = (splitFile[0]).length;
2702 
2703         assert(splitFile.length + 1 == expectedValues.length,
2704                format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2705                       defaultHeader));
2706 
2707         /* printOptions - Not used these tests, but needed for API calls. */
2708         auto printOptions = SummarizerPrintOptions('#', '|');
2709 
2710         /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
2711         auto missingPolicy = new MissingFieldPolicy;
2712 
2713         /* An input header line. */
2714         string[] inputHeaderLine = new string[numFields];
2715         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2716 
2717         auto customOutputFieldHeader = "custom";
2718 
2719         enum HeaderUsecase {
2720             HeaderLine_DefaultHeader,
2721             HeaderLine_CustomHeader,
2722             NoHeaderLine_DefaultHeader,
2723             NoHeaderLine_CustomHeader,
2724             NoHeaderLine_NoOutputHeader,
2725         }
2726 
2727         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2728         {
2729             return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2730                           op.name, hc, actual, expected);
2731         }
2732 
2733         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
2734                                   const char[] actual, const char[] expected)
2735         {
2736             return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
2737                           op.name, hc, rowIndex, actual, expected);
2738         }
2739 
2740         /* Run the logic for each header use case. */
2741         foreach (hc; EnumMembers!HeaderUsecase)
2742         {
2743             bool hasInputHeader = (
2744                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2745                 hc == HeaderUsecase.HeaderLine_CustomHeader
2746                 );
2747             bool hasOutputHeader = (
2748                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2749                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2750                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2751                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2752                 );
2753             bool hasCustomHeader = (
2754                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2755                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2756                 );
2757 
2758             if (hasCustomHeader) assert(hasOutputHeader);
2759 
2760             auto op = new OperatorClass();
2761 
2762             if (hasCustomHeader)
2763             {
2764                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2765                 op.setCustomHeader(customOutputFieldHeader);
2766             }
2767 
2768             Operator[] operatorArray;
2769             operatorArray ~= op;
2770 
2771             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2772             summarizer.setOperators(inputRangeObject(operatorArray));
2773             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2774 
2775             if (hasOutputHeader)
2776             {
2777                 /* Write the header line. Note that this is a one-field header, */
2778                 auto headerLineOutput = appender!(char[])();
2779                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2780 
2781                 /* Test that the header was generated correctly.
2782                  *
2783                  * Note: Because the output is generated by a Summarizer, it will have a
2784                  * trailing newline. Use chomp to trim it.
2785                  */
2786                 final switch (hc)
2787                 {
2788                 case HeaderUsecase.HeaderLine_DefaultHeader:
2789                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2790                     assert(headerLineOutput.data.chomp == defaultHeader,
2791                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2792                                                defaultHeader));
2793                     break;
2794                 case HeaderUsecase.HeaderLine_CustomHeader:
2795                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2796                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2797                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2798                                                customOutputFieldHeader));
2799                     break;
2800                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2801                     break;
2802                 }
2803 
2804             }
2805 
2806             /* For each line, process the line, generate the output, and test that the
2807              * value is correct. Start with the empty file case.
2808              */
2809             foreach (i, const char[] expected; expectedValues)
2810             {
2811                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2812                 auto summaryLineOutput = appender!(char[])();
2813                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2814                 assert(summaryLineOutput.data.chomp == expected,
2815                        valueAssertMessage(operatorArray[0], hc, i,
2816                                           summaryLineOutput.data.chomp, expectedValues[i]));
2817             }
2818         }
2819     }
2820 }
2821 
/* Specific operators.
 *
 * Notes:
 * - 'Calculator' inner classes declared 'static' (e.g. CountCalculator) do not keep a
 *   reference to the context of the outer class. In exchange, such Calculator instances
 *   need to hold all needed state, typically the field index they are summarizing.
 * - 'Calculator' inner classes that are not 'static' keep a reference to their operator
 *   and return it from getOperator() via 'this.outer'.
 */
2829 
/** CountOperator reports the number of entries seen for each unique key, or the number
 * of input lines when there is no unique key.
 *
 * Unlike most operators, CountOperator does not summarize a specific field on the line.
 * It summarizes a property of the unique key itself, which is why it derives from
 * ZeroFieldOperator rather than SingleFieldOperator.
 */
final class CountOperator : ZeroFieldOperator
{
    /** Tallies entries. Declared 'static', so it holds no reference to the operator. */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        /** Returns the current tally, formatted by the print options. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }

        /** Records one more entry. */
        final override void processNextEntry()
        {
            ++_count;
        }
    }

    /** Passes the operator name "count" to the ZeroFieldOperator constructor. */
    this()
    {
        super("count");
    }

    /** Creates a new calculator whose tally starts at zero. */
    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator();
    }
}
2864 
unittest // CountOperator
{
    /* Three inputs with one, two, and three columns. The same expected counts are
     * checked for each of them.
     */
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    foreach (inputFile; [oneColumn, twoColumns, threeColumns])
    {
        testZeroFieldOperator!CountOperator(inputFile, "count", ["0", "1", "2", "3"]);
    }
}
2875 
/** RetainOperator passes through the first occurrence of a field and leaves the header
 * unchanged.
 *
 * It is intended for fields whose value is expected to be the same for every occurrence
 * of the unique key, where the goal is to carry the value through untouched. It is like
 * FirstOperator, except that the original header is preserved. Header preservation is
 * set up in the call to the SingleFieldOperator constructor.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
 */
final class RetainOperator : SingleFieldOperator
{
    /** Calculator that remembers the first field value it is given. */
    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing RetainOperator. */
        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /** Stores a copy of the first value received; later values are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the stored value, or the empty string if no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }
}
2928 
unittest // RetainOperator
{
    /* Inputs with one, two, and three columns. Cell text encodes row and column. */
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The first expected value is the empty-file case; after that the first row's
     * value is retained. The header suffix is empty for this operator.
     */
    testSingleFieldOperator!RetainOperator(oneColumn, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    /* Missing value in the first row, under both missing-field policies. */
    auto withMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!RetainOperator(withMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!RetainOperator(withMissing, 0, "", ["", "NA", "NA", "NA"],
                                           new MissingFieldPolicy(false, "NA"));
}
2948 
/** FirstOperator outputs the first value encountered for the field.
 */
final class FirstOperator : SingleFieldOperator
{
    /** Calculator that remembers the first field value it is given. */
    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing FirstOperator. */
        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /** Stores a copy of the first value received; later values are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the stored value, or the empty string if no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }
}
2993 
unittest // FirstOperator
{
    /* Inputs with one, two, and three columns. Cell text encodes row and column. */
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The first expected value is the empty-file case; after that the first row's
     * value is reported no matter how many rows follow.
     */
    testSingleFieldOperator!FirstOperator(oneColumn, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    /* Missing value in the first row, under both missing-field policies. */
    auto withMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!FirstOperator(withMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!FirstOperator(withMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          new MissingFieldPolicy(false, "NA"));
}
3013 
/** LastOperator outputs the most recent value encountered for the field.
 */
final class LastOperator : SingleFieldOperator
{
    /** Calculator that keeps only the latest field value it is given. */
    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing LastOperator. */
        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /** Overwrites the stored value with a copy of the new one. */
        final override void processNextField(const char[] nextField)
        {
            _value = to!string(nextField);
        }

        /** Returns the stored value, or the empty string if no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }
}
3053 
unittest // LastOperator
{
    /* Inputs with one, two, and three columns. Cell text encodes row and column. */
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The first expected value is the empty-file case; after that the value tracks
     * the most recently processed row.
     */
    testSingleFieldOperator!LastOperator(oneColumn, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(threeColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    /* Missing value in the first row, under both missing-field policies. */
    auto withMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!LastOperator(withMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!LastOperator(withMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(false, "NA"));
}
3073 
/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 *
 * The result is the original input string of the minimum value, with no additional
 * numeric formatting applied. This can be useful when joining back to the original
 * data. It differs in this respect from numeric operators that perform calculations.
 */
final class MinOperator : SingleFieldOperator
{
    /** Calculator tracking the smallest value seen and the text it was parsed from. */
    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MinOperator. */
        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /** Parses the field as a double. The first value is always taken; a later
         * value replaces the current one only when it compares strictly smaller.
         */
        final override void processNextField(const char[] nextField)
        {
            const double parsed = to!double(nextField);

            if (_isFirst || parsed < _value)
            {
                _value = parsed;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the minimum, or "nan" if no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }
}
3130 
unittest // MinOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value is the empty-file case, which reports "nan". */
    testSingleFieldOperator!MinOperator(oneColumn, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(twoColumns, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoColumns, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(threeColumns, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeColumns, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeColumns, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    /* Missing value in the first row, under both missing-field policies. */
    auto withMissing = [[""], ["10"], ["-10"]];

    // Exclude missing
    testSingleFieldOperator!MinOperator(withMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!MinOperator(withMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        new MissingFieldPolicy(false, "5"));
}
3150 
/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
 *
 * The result is the original input string of the maximum value, with no additional
 * numeric formatting applied. This can be useful when joining back to the original
 * data. It differs in this respect from numeric operators that perform calculations.
 */
final class MaxOperator : SingleFieldOperator
{
    /** Calculator tracking the largest value seen and the text it was parsed from. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MaxOperator. */
        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /** Parses the field as a double. The first value is always taken; a later
         * value replaces the current one only when it compares strictly larger.
         */
        final override void processNextField(const char[] nextField)
        {
            const double parsed = to!double(nextField);

            if (_isFirst || parsed > _value)
            {
                _value = parsed;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the maximum, or "nan" if no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }
}
3207 
unittest // MaxOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value is the empty-file case, which reports "nan". */
    testSingleFieldOperator!MaxOperator(oneColumn, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    /* Missing value in the first row, under both missing-field policies. */
    auto withMissing = [[""], ["-10"], ["10"]];

    // Exclude missing
    testSingleFieldOperator!MaxOperator(withMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!MaxOperator(withMissing, 0, "max", ["nan", "5", "5", "10"],
                                        new MissingFieldPolicy(false, "5"));
}
3227 
/** RangeOperator outputs the difference between the maximum and minimum values.
 *
 * The range is zero when there is a single value or when all values are the same.
 * This is a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    /** Calculator tracking the smallest and largest values seen. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing RangeOperator. */
        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /** Parses the field as a double and widens the tracked bounds if needed. */
        final override void processNextField(const char[] nextField)
        {
            const double parsed = to!double(nextField);

            /* The first value becomes both the minimum and the maximum. */
            if (_isFirst)
            {
                _maxValue = parsed;
                _minValue = parsed;
                _isFirst = false;
                return;
            }

            if (parsed > _maxValue) _maxValue = parsed;
            else if (parsed < _minValue) _minValue = parsed;
        }

        /** Returns the formatted difference of the bounds; both start at 0.0, so the
         * result is zero when no value was received.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }
}
3285 
unittest // RangeOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value is the empty-file case, which reports "0". */
    testSingleFieldOperator!RangeOperator(oneColumn, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(twoColumns, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(twoColumns, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 2, "range", ["0", "0", "4", "16.5"]);

    /* Missing values in rows one and three, under both missing-field policies. */
    auto withMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    // Exclude missing
    testSingleFieldOperator!RangeOperator(withMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!RangeOperator(withMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          new MissingFieldPolicy(false, "5.5"));
}
3305 
/** SumOperator outputs the sum of all the field's values. This is a numeric operator.
 */
final class SumOperator : SingleFieldOperator
{
    /** Calculator keeping a running total of the values seen. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing SumOperator. */
        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /** Parses the field as a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
        }

        /** Returns the formatted total; zero when no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }
}
3345 
unittest // SumOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value is the empty-file case, which reports "0". */
    testSingleFieldOperator!SumOperator(oneColumn, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(twoColumns, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoColumns, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(threeColumns, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeColumns, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeColumns, 2, "sum", ["0", "-4.5", "-5", "7"]);

    /* Missing values in rows one and three, under both missing-field policies. */
    auto withMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    // Exclude missing
    testSingleFieldOperator!SumOperator(withMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!SumOperator(withMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        new MissingFieldPolicy(false, "1.5"));
}
3365 
/** MeanOperator outputs the mean (average) of all the field's values. This is a numeric
 * operator.
 */
final class MeanOperator : SingleFieldOperator
{
    /** Calculator keeping a running total and a count of the values seen. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MeanOperator. */
        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /** Parses the field as a double, adds it to the total, and bumps the count. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
            ++_count;
        }

        /** Returns the formatted mean, or formatted NaN when no value was received. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double mean = double.nan;
            if (_count > 0) mean = _total / to!double(_count);

            return printOptions.formatNumber(mean);
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }
}
3408 
unittest // MeanOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["7.5"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The first expected value is the empty-file case, which reports "nan". */
    testSingleFieldOperator!MeanOperator(oneColumn, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(twoColumns, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoColumns, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    /* Missing values in rows one and three, under both missing-field policies. */
    auto withMissing = [[""], ["6"], [""], ["14"], ["40"]];

    // Exclude missing
    testSingleFieldOperator!MeanOperator(withMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!MeanOperator(withMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         new MissingFieldPolicy(false, "0"));
}
3428 
/** MedianOperator outputs the median of all the field's values. This is a numeric
 * operator.
 *
 * All the field values are stored in memory as part of this calculation. The storage
 * is handled by unique key value lists, not by the calculator.
 */
final class MedianOperator : SingleFieldOperator
{
    /** Calculator that keeps no state; the median comes from the saved values. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MedianOperator. */
        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Returns the formatted median of this field's values from the values lists. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }
}
3469 
unittest // MedianOperator
{
    /* Numeric inputs with one, two, and three columns. */
    auto oneColumn = [["10"], ["9.5"], ["7.5"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The first expected value is the empty-file case, which reports "nan". */
    testSingleFieldOperator!MedianOperator(oneColumn, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(twoColumns, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoColumns, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    /* Missing values in rows one and three, under both missing-field policies. */
    auto withMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    // Exclude missing
    testSingleFieldOperator!MedianOperator(withMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           new MissingFieldPolicy(true, ""));
    // Replace missing
    testSingleFieldOperator!MedianOperator(withMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           new MissingFieldPolicy(false, "0"));
}
3489 
/** QuantileOperator outputs the value representing the data at a cumulative probability.
 * This is a numeric operator.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). Data is sorted in ascending order. This operator takes one probability, but
 * it is common to generate multiple quantile ranks for the same field when summarizing.
 *
 * All the field's values are stored in memory as part of this calculation. The storage
 * is handled by unique key value lists, not by the calculator.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /** Calculator that keeps no state; the quantile comes from the saved values. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing QuantileOperator. */
        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Returns the formatted quantile, at the operator's probability, of this
         * field's sorted values from the values lists.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            auto sortedValues = valuesLists.numericValuesSorted(fieldIndex);
            return printOptions.formatNumber(quantile(this.outer._prob, sortedValues));
        }
    }

    /** The probability must be in the range [0.0, 1.0]. The operator name is "pct"
     * followed by the probability as a percentage, e.g. "pct50" for 0.5.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(probability >= 0.0 && probability <= 1.0);

        /* Zero is special-cased so the name is "pct0". */
        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /** Creates a calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }
}
3545 
unittest // QuantileOperator
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["10"], ["9.5"], ["7.5"]];
    auto rows2 = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto rows3 = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto noMissingPolicy = new MissingFieldPolicy;

    /* Probability 0.5: expected values are the same as in the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(rows1, 0, "pct50", ["nan", "10", "9.75", "9.5"], noMissingPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 0, "pct50", ["nan", "20", "20.5", "21"], noMissingPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 1, "pct50", ["nan", "-30", "-29.5", "-30"], noMissingPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 0, "pct50", ["nan", "9009", "4509", "4509"], noMissingPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 1, "pct50", ["nan", "9", "4.5", "0"], noMissingPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], noMissingPolicy, 0.50);

    /* Probability 0.0: the expected value is the minimum seen so far. */
    testSingleFieldOperatorBase!QuantileOperator(rows1, 0, "pct0", ["nan", "10", "9.5", "7.5"], noMissingPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 0, "pct0", ["nan", "20", "20", "20"], noMissingPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 1, "pct0", ["nan", "-30", "-30", "-31"], noMissingPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 0, "pct0", ["nan", "9009", "9", "9"], noMissingPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 1, "pct0", ["nan", "9", "0", "-3"], noMissingPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], noMissingPolicy, 0.0);

    /* Probability 1.0: the expected value is the maximum seen so far. */
    testSingleFieldOperatorBase!QuantileOperator(rows1, 0, "pct100", ["nan", "10", "10", "10"], noMissingPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 0, "pct100", ["nan", "20", "21", "22"], noMissingPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(rows2, 1, "pct100", ["nan", "-30", "-29", "-29"], noMissingPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 0, "pct100", ["nan", "9009", "9009", "9009"], noMissingPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 1, "pct100", ["nan", "9", "9", "9"], noMissingPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(rows3, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], noMissingPolicy, 1.0);

    /* Missing-value policies: these cases mirror the median tests. */
    auto rowsWithBlanks = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(
        rowsWithBlanks, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
        new MissingFieldPolicy(true, ""), 0.5);

    // Replace missing
    testSingleFieldOperatorBase!QuantileOperator(
        rowsWithBlanks, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
        new MissingFieldPolicy(false, "0"), 0.5);
}
3584 
/** MadOperator reports the median absolute deviation (MAD) from the median. It is
 * a numeric operation.
 *
 * The raw MAD is output; no normalization is applied to it.
 *
 * The calculation needs every field value. The operator asks for numeric values
 * to be saved, and the unique key value lists hold them in memory.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to accumulate per field; the saved field values are used instead. */
        final override void processNextField(const char[] nextField)
        { }

        /* Takes the median of the absolute distances between each saved value and
         * the median of the saved values, then formats it via printOptions.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto observations = valuesLists.numericValues(fieldIndex);

            auto absDeviations = new double[observations.length];
            foreach (size_t pos, double observed; observations)
            {
                absDeviations[pos] = abs(observed - center);
            }

            auto mad = absDeviations.rangeMedian;
            return printOptions.formatNumber(mad);
        }
    }
}
3637 
unittest // MadOperator
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto rows2 = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto rows3 = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(rows1, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(rows2, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(rows2, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(rows3, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(rows3, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(rows3, 2, "mad", ["nan", "0", "1", "2"]);

    /* Input containing empty fields, run under two missing-value policies. */
    auto rowsWithBlanks = [[""], ["16"], [""], ["32"], ["-4"]];

    // Exclude missing
    testSingleFieldOperator!MadOperator(
        rowsWithBlanks, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MadOperator(
        rowsWithBlanks, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
        new MissingFieldPolicy(false, "0"));
}
3657 
/** VarianceOperator generates the variance of the field values. This is a numeric
 * operator.
 *
 * The result is computed incrementally (a running count, mean, and sum of squared
 * differences), dividing by count - 1. Fewer than two values yields nan.
 */
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;  // Number of values processed
        private double _mean = 0.0;   // Running mean of the values
        private double _m2 = 0.0;     // Running sum of squared differences from the mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double x = nextField.to!double;
            double offsetFromOldMean = x - _mean;

            _mean += offsetFromOldMean / _count;

            /* Second factor uses the mean after the update above. */
            _m2 += offsetFromOldMean * (x - _mean);
        }

        /* Formats _m2 / (_count - 1), or nan when fewer than two values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double variance = double.nan;
            if (_count >= 2.0)
            {
                variance = _m2 / (_count - 1.0);
            }
            return printOptions.formatNumber(variance);
        }
    }
}
3704 
unittest // VarianceOperator
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["5"], ["10"], ["15"]];
    auto rows2 = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto rows3 = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(rows1, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(rows2, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(rows2, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(rows3, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(rows3, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(rows3, 2, "var", ["nan", "nan", "0", "3"]);

    /* Input ending in an empty field, run under two missing-value policies. */
    auto rowsWithBlanks = [["5"], ["10"], [""]];

    // Exclude missing
    testSingleFieldOperator!VarianceOperator(
        rowsWithBlanks, 0, "var", ["nan", "nan", "12.5", "12.5"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!VarianceOperator(
        rowsWithBlanks, 0, "var", ["nan", "nan", "12.5", "25"],
        new MissingFieldPolicy(false, "15"));
}
3724 
/** StDevOperator generates the standard deviation of the field values. This is a
 * numeric operator.
 *
 * A running count, mean, and sum of squared differences are maintained; the result
 * is the square root of that sum divided by count - 1. Fewer than two values
 * yields nan.
 */
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;  // Number of values processed
        private double _mean = 0.0;   // Running mean of the values
        private double _m2 = 0.0;     // Running sum of squared differences from the mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double x = nextField.to!double;
            double offsetFromOldMean = x - _mean;

            _mean += offsetFromOldMean / _count;

            /* Second factor uses the mean after the update above. */
            _m2 += offsetFromOldMean * (x - _mean);
        }

        /* Formats sqrt(_m2 / (_count - 1)), or nan when fewer than two values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            double stdev = double.nan;
            if (_count >= 2.0)
            {
                stdev = sqrt(_m2 / (_count - 1.0));
            }
            return printOptions.formatNumber(stdev);
        }
    }
}
3772 
/* Unit tests for StDevOperator. Expected values are compared as formatted text; a
 * tolerance option would improve these tests.
 */
unittest
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["1"], ["4"], ["7"]];
    auto rows2 = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto rows3 = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    testSingleFieldOperator!StDevOperator(rows1, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(rows2, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(rows2, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(rows3, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(rows3, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(rows3, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* Input ending in an empty field, run under two missing-value policies. */
    auto rowsWithBlanks = [["1"], ["4"], [""]];

    // Exclude missing
    testSingleFieldOperator!StDevOperator(
        rowsWithBlanks, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!StDevOperator(
        rowsWithBlanks, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
        new MissingFieldPolicy(false, "7"));
}
3794 
/** UniqueCountOperator outputs how many distinct values were seen. Values are
 * distinguished by exact text match, not by numeric comparison.
 *
 * Each distinct field value is kept in memory while the count is being built.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Used as a set: the keys are the distinct values, the bool is a placeholder. */
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field text if it has not been seen before. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _values) return;

            /* Copy to a string only when a new key is actually inserted. */
            _values[nextField.to!string] = true;
        }

        /* The number of keys recorded is the unique count. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}
3837 
unittest // UniqueCount
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto rows2 = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto rows3 = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(rows1, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(rows2, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(rows2, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(rows3, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(rows3, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(rows3, 2, "unique_count", ["0", "1", "2", "3"]);

    /* Input containing empty fields, run under two missing-value policies. */
    auto rowsWithBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    // Exclude missing
    testSingleFieldOperator!UniqueCountOperator(
        rowsWithBlanks, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!UniqueCountOperator(
        rowsWithBlanks, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        new MissingFieldPolicy(false, "XYZ"));
}
3859 
/** MissingCountOperator outputs the number of missing values. It overrides the
 * global missingFieldsPolicy: the policy passed to the constructor is kept only to
 * test whether a field is missing, and the base class is given a separate
 * MissingFieldPolicy(false, "") instead.
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field when the saved global policy classifies it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;
            if (!policy.isMissingField(nextField)) return;
            ++_missingCount;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3903 
unittest // MissingCount
{
    /* Inputs with one, two, and three columns. Note the " " entry in the first. */
    auto rows1 = [["a"], ["b"], [""], [" "], [""]];
    auto rows2 = [["abc", ""], ["", ""], ["def", ""]];
    auto rows3 = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* No missing-value policy argument supplied. */
    testSingleFieldOperator!MissingCountOperator(rows1, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(rows2, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(rows2, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(rows3, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(rows3, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(rows3, 2, "missing_count", ["0", "0", "0", "1"]);

    auto skipBlanks = new MissingFieldPolicy(true, "");
    auto fillBlanks = new MissingFieldPolicy(false, "X");

    /* The expected counts below are the same as above under both policies. */
    testSingleFieldOperator!MissingCountOperator(rows1, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], skipBlanks);
    testSingleFieldOperator!MissingCountOperator(rows2, 0, "missing_count", ["0", "0", "1", "1"], skipBlanks);
    testSingleFieldOperator!MissingCountOperator(rows2, 1, "missing_count", ["0", "1", "2", "3"], skipBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 0, "missing_count", ["0", "1", "1", "1"], skipBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 1, "missing_count", ["0", "0", "1", "2"], skipBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 2, "missing_count", ["0", "0", "0", "1"], skipBlanks);

    testSingleFieldOperator!MissingCountOperator(rows1, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], fillBlanks);
    testSingleFieldOperator!MissingCountOperator(rows2, 0, "missing_count", ["0", "0", "1", "1"], fillBlanks);
    testSingleFieldOperator!MissingCountOperator(rows2, 1, "missing_count", ["0", "1", "2", "3"], fillBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 0, "missing_count", ["0", "1", "1", "1"], fillBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 1, "missing_count", ["0", "0", "1", "2"], fillBlanks);
    testSingleFieldOperator!MissingCountOperator(rows3, 2, "missing_count", ["0", "0", "0", "1"], fillBlanks);
}
3934 
/** NotMissingCountOperator outputs the number of values that are not missing. It
 * overrides the global missingFieldsPolicy: the policy passed to the constructor
 * is kept only to test whether a field is missing, and the base class is given a
 * separate MissingFieldPolicy(false, "") instead.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field unless the saved global policy classifies it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;
            if (policy.isMissingField(nextField)) return;
            ++_notMissingCount;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
3978 
unittest // NotMissingCount
{
    /* Inputs with one, two, and three columns. Note the " " entry in the first. */
    auto rows1 = [["a"], ["b"], [""], [" "], [""]];
    auto rows2 = [["abc", ""], ["", ""], ["def", ""]];
    auto rows3 = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* No missing-value policy argument supplied. */
    testSingleFieldOperator!NotMissingCountOperator(rows1, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto skipBlanks = new MissingFieldPolicy(true, "");
    auto fillBlanks = new MissingFieldPolicy(false, "X");

    /* The expected counts below are the same as above under both policies. */
    testSingleFieldOperator!NotMissingCountOperator(rows1, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], skipBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 0, "not_missing_count", ["0", "1", "1", "2"], skipBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 1, "not_missing_count", ["0", "0", "0", "0"], skipBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 0, "not_missing_count", ["0", "0", "1", "2"], skipBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 1, "not_missing_count", ["0", "1", "1", "1"], skipBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 2, "not_missing_count", ["0", "1", "2", "2"], skipBlanks);

    testSingleFieldOperator!NotMissingCountOperator(rows1, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], fillBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 0, "not_missing_count", ["0", "1", "1", "2"], fillBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows2, 1, "not_missing_count", ["0", "0", "0", "0"], fillBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 0, "not_missing_count", ["0", "0", "1", "2"], fillBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 1, "not_missing_count", ["0", "1", "1", "1"], fillBlanks);
    testSingleFieldOperator!NotMissingCountOperator(rows3, 2, "not_missing_count", ["0", "1", "2", "2"], fillBlanks);
}
4009 
/** ModeOperator outputs the value that occurs most often. When several values share
 * the highest count, the one that was seen first is output.
 *
 * Each unique field value, along with its occurrence count, is held in memory as
 * part of this calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    final class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;        // Occurrence count per unique value
        private Appender!(string[]) _uniqueValues;  // Unique values in first-seen order

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto tally = nextField in _valueCounts)
            {
                (*tally)++;
            }
            else
            {
                string firstSeen = nextField.to!string;
                _uniqueValues.put(firstSeen);
                _valueCounts[firstSeen] = 1;
            }
        }

        /* Walks the values in first-seen order. The strict comparison means a later
         * value with an equal count never displaces an earlier one. The result is
         * the empty string when no values were recorded.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string leader = "";
            size_t leaderCount = 0;

            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                auto occurrences = _valueCounts[candidate];
                if (occurrences <= leaderCount) continue;

                leader = candidate;
                leaderCount = occurrences;
            }

            return leader;
        }
    }
}
4081 
unittest // ModeOperator
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto rows2 = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto rows3 = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(rows1, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(rows2, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(rows2, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(rows3, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(rows3, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(rows3, 2, "mode", ["", "a", "a", "a"]);

    /* Input containing empty fields, run under two missing-value policies. */
    auto rowsWithBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    // Exclude missing
    testSingleFieldOperator!ModeOperator(
        rowsWithBlanks, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!ModeOperator(
        rowsWithBlanks, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
        new MissingFieldPolicy(false, "X"));
}
4103 
/** ModeCountOperator outputs how many times the most frequent value occurred.
 *
 * Each unique field value, along with its occurrence count, is held in memory as
 * part of this calculation.
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    final class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;  // Occurrence count per unique value

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto tally = nextField in _valueCounts)
            {
                (*tally)++;
            }
            else
            {
                string firstSeen = nextField.to!string;
                _valueCounts[firstSeen] = 1;
            }
        }

        /* Formats the largest recorded count; zero when nothing was recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highest = 0;
            foreach (occurrences; _valueCounts.byValue)
            {
                if (occurrences > highest)
                {
                    highest = occurrences;
                }
            }
            return printOptions.formatNumber(highest);
        }
    }
}
4158 
unittest // ModeCountOperator
{
    /* Inputs with one, two, and three columns. */
    auto rows1 = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto rows2 = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto rows3 = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeCountOperator(rows1, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(rows2, 0, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(rows2, 1, "mode_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(rows3, 0, "mode_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(rows3, 1, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(rows3, 2, "mode_count", ["0", "1", "1", "1"]);

    /* Input containing empty fields, run under two missing-value policies. */
    auto rowsWithBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    // Exclude missing
    testSingleFieldOperator!ModeCountOperator(
        rowsWithBlanks, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!ModeCountOperator(
        rowsWithBlanks, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
        new MissingFieldPolicy(false, "X"));
}
4180 
/** ValuesOperator outputs every value, joined with the alternate (values) delimiter
 * taken from the print options.
 *
 * The calculation needs every field value. The operator asks for text values to be
 * saved, and the unique key value lists hold them in memory.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to accumulate per field; the saved field values are used instead. */
        final override void processNextField(const char[] nextField)
        { }

        /* Joins the saved text values for this field with printOptions.valuesDelimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedText = valuesLists.textValues(fieldIndex);
            return join(savedText, printOptions.valuesDelimiter);
        }
    }
}
4222 
unittest // ValuesOperator
{
    /* Inputs with one, two, and three columns; the first two contain empty fields. */
    auto rows1 = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto rows2 = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto rows3 = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* No missing-value policy argument: empty fields appear as empty entries. */
    testSingleFieldOperator!ValuesOperator(rows1, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(rows2, 0, "values", ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(rows2, 1, "values", ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(rows3, 0, "values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(rows3, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(rows3, 2, "values", ["", "-", "-|--", "-|--|---"]);

    // Exclude missing
    testSingleFieldOperator!ValuesOperator(
        rows1, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
        new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!ValuesOperator(
        rows1, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
        new MissingFieldPolicy(false, "X"));
}
4243 
/** UniqueValuesOperator outputs each distinct value of a field exactly once, joined
 * by the values delimiter from the print options. Values are output in the order
 * they were first seen.
 *
 * Every distinct field value is kept in memory as part of this calculation.
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    /** Constructs the "unique_values" operator for a single field.
     *
     * Params:
     *   fieldIndex    = index of the field this operator summarizes
     *   missingPolicy = missing-field policy handed to the base class
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    /** Creates a new UniqueValuesCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new UniqueValuesCalculator(fieldIndex);
        return calculator;
    }

    /** Calculator that records the distinct values of one field in first-seen order. */
    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        /* Used as a set: only key membership matters, the stored number is ignored. */
        private size_t[string] _valuesHash;

        /* The distinct values, in the order they were first encountered. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing UniqueValuesOperator instance. */
        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /** Records nextField if it has not been seen before; otherwise does nothing. */
        final override void processNextField(const char[] nextField)
        {
            const alreadySeen = (nextField in _valuesHash) !is null;
            if (alreadySeen) return;

            /* Convert once and use the same string as both set key and list entry. */
            string value = to!string(nextField);
            _valuesHash[value] = 1;
            _uniqueValues.put(value);
        }

        /** Joins the recorded distinct values, separated by
         * printOptions.valuesDelimiter. The valuesLists argument is not used.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctValues = _uniqueValues.data;
            return distinctValues.join(printOptions.valuesDelimiter);
        }
    }
}
4295 
unittest // UniqueValuesOperator
{
    /* Shorthand for the test helper instantiated for the operator under test. */
    alias check = testSingleFieldOperator!UniqueValuesOperator;

    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Default missing-field policy: the empty string counts as a value and is listed once. */
    check(col1File, 0, "unique_values",
          ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    check(col2File, 0, "unique_values", ["", "", "", "|xyz"]);
    check(col2File, 1, "unique_values", ["", "50", "50", "50|52"]);
    check(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]);
    check(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]);
    check(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]);

    /* Exclude missing: empty fields do not appear in the output. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(col1File, 0, "unique_values",
          ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
          excludeMissing);

    /* Replace missing: empty fields appear as "X", listed once. */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    check(col1File, 0, "unique_values",
          ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
          replaceMissing);
}