1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.exception : enforce;
16 import std.format : format;
17 import std.range;
18 import std.stdio;
19 import std.typecons : tuple;
20 import std.container : DList;
21 
/* Sets the D runtime GC option `cleanup:none` for this program. The declaration is
 * only compiled in when the compiler front-end version (__VERSION__) is 2085 or later.
 * NOTE(review): per the druntime GC configuration docs, `cleanup:none` skips the
 * garbage collector's cleanup pass at program termination - confirm against the docs
 * for the compiler versions in use.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
23 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line and then runs the summarization. Returns the exit
     * code produced by argument processing when it reports that execution should not
     * continue, 1 if tsvSummarize throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when built in DMD code coverage mode. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto argsStatus = cmdopt.processArgs(cmdArgs);

        /* argsStatus[0]: continue execution? argsStatus[1]: exit code when not. */
        immutable bool shouldContinue = argsStatus[0];
        if (!shouldContinue) return argsStatus[1];

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            tsvSummarize(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
56 
/** Verbose help text. processArgs prints it ahead of the generated option list when
 * '--help-verbose' is given.
 *
 * The operator table must use the same names as the options registered with getopt
 * in processArgs (e.g. 'stdev').
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  Make    Color   Time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make is generated by the command:

  $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv

This produces:

  Make   Time_min Time_mean
  ford   122      127
  chevy  124      124
  bmw    118      122

Using '--group-by Make,Color' will group by both 'Make' and 'Color'.
Omitting the '--group-by' entirely summarizes fields for the full file.

The previous example uses field names to identify fields. Field numbers
can be used as well. The next two commands are equivalent:

  $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv
  $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv

The program tries to generate useful headers, but custom headers can be
specified. Example (using -g and -H shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11
  --median elapsed_time,system_time,user_time
  --median '*_time'              # Wildcard. All fields ending in '_time'.

The quantile operator requires one or more probabilities after the fields:

  --quantile run_time:0.25       # Quantile 1 of the 'run_time' field
  --quantile 2:0.25              # Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   # Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
143 
/** Brief help text. processArgs prints it ahead of the generated option list when
 * getopt reports that help was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Fields can be specified either by field number or field name. Use
'--help-verbose' for more detailed help.

Options:
EOS";
154 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                /// Program name, derived from cmdArgs[0] by processArgs.
    ByLineSourceRange!() inputSources; /// Input files, or standard input ("-") if none were given.
    size_t[] keyFields;                /// --g|group-by. Zero-based field indices.
    bool hasHeader = false;            /// --H|header
    bool writeHeader = false;          /// --w|write-header
    char inputFieldDelimiter = '\t';   /// --d|delimiter
    char valuesDelimiter = '|';        /// --v|values-delimiter
    size_t floatPrecision = 12;        /// --p|float-precision
    DList!Operator operators;          /// Operators, in the order specified.
    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.
    /* NOTE(review): the initializer object is created once and is presumably shared by
     * every TsvSummarizeOptions instance - confirm if more than one instance is created.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value. Updated by processArgs from --x|exclude-missing and --r|replace-missing.

    /* tsv-summarize operators require access to the header line when the operator is
     * created. This is because named fields may be used to identify fields. To enable
     * this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions array
     * during initial processing by std.getopt. The group-by option is handled the same
     * way, but its delegate is added to cmdLineOtherFieldOptions instead. At least one
     * cmdLineOperatorOptions entry is required.
     *
     * The different handlers are defined after processArgs.
     */

    /* CmdOptionHandler delegate signature - This is the call made to process the command
     * line option arguments after the header line has been read (or immediately, if no
     * header line is being used).
     */
    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

    private CmdOptionHandler[]  cmdLineOperatorOptions;     // One entry per operator option given.
    private CmdOptionHandler[]  cmdLineOtherFieldOptions;   // Entries for --g|group-by.
190 
191     /* Returns a tuple. First value is true if command line arguments were successfully
192      * processed and execution should continue, or false if an error occurred or the user
193      * asked for help. If false, the second value is the appropriate exit code (0 or 1).
194      *
195      * Returning true (execution continues) means args have been validated and derived
196      * values calculated. In addition, field indices have been converted to zero-based.
197      */
198     auto processArgs (ref string[] cmdArgs) {
199         import std.algorithm : any, each;
200         import std.getopt;
201         import std.path : baseName, stripExtension;
202         import std.typecons : Yes, No;
203         import tsv_utils.common.fieldlist : fieldListHelpText;
204         import tsv_utils.common.getopt_inorder;
205         import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;
206 
207         bool helpVerbose = false;          // --help-verbose
208         bool helpFields = false;           // --help-fields
209         bool versionWanted = false;        // --V|version
210         bool excludeMissing = false;       // --x|exclude-missing
211         string missingValueReplacement;    // --r|replace-missing
212 
213 
214         programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";
215 
216         try
217         {
218             arraySep = ",";    // Use comma to separate values in command line options
219             auto r = getoptInorder(
220                 cmdArgs,
221                 "help-verbose",       "              Print full help.", &helpVerbose,
222                 "help-fields",        "              Print help on specifying fields.", &helpFields,
223 
224                 std.getopt.config.caseSensitive,
225                 "V|version",          "              Print version information and exit.", &versionWanted,
226                 std.getopt.config.caseInsensitive,
227 
228                 "g|group-by",         "<field-list>  Fields to use as key.", &addGroupByOptionHandler,
229 
230                 std.getopt.config.caseSensitive,
231                 "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
232                 std.getopt.config.caseInsensitive,
233 
234                 "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
235                 "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
236                 "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
237                 "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
238                 "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
239                 "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
240                 "count",              "              Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler,
241                 "count-header",       "STR           Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler,
242                 "retain",             "<field-list>  Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator,
243                 "first",              "<field-list>[:STR]  First value seen.", &addOperatorOptionHandler!FirstOperator,
244                 "last",               "<field-list>[:STR]  Last value seen.", &addOperatorOptionHandler!LastOperator,
245                 "min",                "<field-list>[:STR]  Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator,
246                 "max",                "<field-list>[:STR]  Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator,
247                 "range",              "<field-list>[:STR]  Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator,
248                 "sum",                "<field-list>[:STR]  Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator,
249                 "mean",               "<field-list>[:STR]  Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator,
250                 "median",             "<field-list>[:STR]  Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator,
251                 "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler,
252                 "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator,
253                 "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator,
254                 "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator,
255                 "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator,
256                 "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator,
257                 "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator,
258                 "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator,
259                 "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator,
260                 "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator,
261                 "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator,
262                 );
263 
264             if (r.helpWanted)
265             {
266                 defaultGetoptPrinter(helpText, r.options);
267                 return tuple(false, 0);
268             }
269             else if (helpVerbose)
270             {
271                 defaultGetoptPrinter(helpTextVerbose, r.options);
272                 return tuple(false, 0);
273             }
274             else if (helpFields)
275             {
276                 writeln(fieldListHelpText);
277                 return tuple(false, 0);
278             }
279             else if (versionWanted)
280             {
281                 import tsv_utils.common.tsvutils_version;
282                 writeln(tsvutilsVersionNotice("tsv-summarize"));
283                 return tuple(false, 0);
284             }
285 
286             /* Remaining command line args are files. Use standard input if files
287              * were not provided. Truncate cmdArgs to consume the arguments.
288              */
289             string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
290             cmdArgs.length = 1;
291 
292             /* Validation and derivations - Do as much validation prior to header line
293              * processing as possible (avoids waiting on stdin).
294              */
295 
296             enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");
297 
298             enforce(inputFieldDelimiter != valuesDelimiter,
299                     "Cannot use the same character for both --d|field-delimiter and --v|values-delimiter.");
300 
301             enforce(!(excludeMissing && missingValueReplacement.length != 0),
302                     "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
303 
304             /* Missing field policy. */
305             globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
306 
307             string[] headerFields;
308 
309             /* fieldListArgProcessing encapsulates the field list processing. It is
310              * called prior to reading the header line if headers are not being used,
311              * and after if headers are being used.
312              */
313             void fieldListArgProcessing()
314             {
315                 /* Run all the operator handlers. */
316                 cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
317                 cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));
318 
319                 /* keyFields need to be part of the endFieldIndex, which is one past
320                  * the last field index. */
321                 keyFields.each!(delegate (size_t x)
322                                 {
323                                     if (x >= endFieldIndex) endFieldIndex = x + 1;
324                                 } );
325             }
326 
327             if (!hasHeader) fieldListArgProcessing();
328 
329             /*
330              * Create the byLineSourceRange and perform header line processing.
331              */
332             inputSources = byLineSourceRange(filepaths);
333 
334 
335             if (hasHeader)
336             {
337                 if (!inputSources.front.byLine.empty)
338                 {
339                     throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
340                     headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
341                 }
342 
343                 fieldListArgProcessing();
344             }
345         }
346         catch (Exception exc)
347         {
348             stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
349             return tuple(false, 1);
350         }
351         return tuple(true, 0);
352     }
353 
354     private void addGroupByOptionHandler(string option, string optionVal)
355     {
356         cmdLineOtherFieldOptions ~=
357             (bool hasHeader, string[] headerFields)
358             => groupByOptionHandler(hasHeader, headerFields, option, optionVal);
359     }
360 
361     private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
362     {
363         import tsv_utils.common.fieldlist;
364 
365         try
366         {
367             keyFields =
368                 optionVal
369                 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields)
370                 .array;
371         }
372         catch (Exception e)
373         {
374             e.msg = format("[--%s %s]. %s", option, optionVal, e.msg);
375             throw e;
376         }
377     }
378 
379     private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
380     {
381         cmdLineOperatorOptions ~=
382             (bool hasHeader, string[] headerFields)
383             => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal);
384     }
385 
386     /* operationOptionHandler functions are callbacks that process command line options
387      * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
388      * check syntactic correctness and instantiate Operator objects that do the work. This
389      * is also where 1-upped field numbers are converted to 0-based indices.
390      */
391     private void operatorOptionHandler(OperatorClass : SingleFieldOperator)
392     (bool hasHeader, string[] headerFields, string option, string optionVal)
393     {
394         import std.range : enumerate;
395         import std.typecons : Yes, No;
396         import tsv_utils.common.fieldlist;
397 
398         try
399         {
400             auto optionValParse =
401                 optionVal
402                 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
403                 (hasHeader, headerFields);
404 
405             auto fieldIndices = optionValParse.array;
406             bool hasOptionalHeader = optionVal.length > optionValParse.consumed;
407             string optionalHeader;
408 
409             if (hasOptionalHeader)
410             {
411                 enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields.");
412                 enforce(optionVal.length - optionValParse.consumed > 1,
413                         format("No value after field list.\n   Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
414                                option, option));
415                 optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup;
416             }
417 
418             foreach (fieldIndex; fieldIndices)
419             {
420                 auto op = new OperatorClass(fieldIndex, globalMissingPolicy);
421 
422                 if (hasOptionalHeader)
423                 {
424                     enforce(op.allowCustomHeader, "Operator does not support custom headers.");
425                     op.setCustomHeader(optionalHeader);
426                 }
427 
428                 operators.insertBack(op);
429                 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
430             }
431         }
432         catch (Exception exc)
433         {
434             import std.format : format;
435             exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg);
436             throw exc;
437         }
438     }
439 
440     private void addQuantileOperatorOptionHandler(string option, string optionVal)
441     {
442         cmdLineOperatorOptions ~=
443             (bool hasHeader, string[] headerFields)
444             => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal);
445     }
446 
447     /* QuantileOperator has a different syntax and needs a custom command option handler. */
448     private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
449     {
450         import std.typecons : Yes, No;
451         import tsv_utils.common.fieldlist;
452 
453         try
454         {
455             auto optionValParse =
456                 optionVal
457                 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
458                 (hasHeader, headerFields);
459 
460             auto fieldIndices = optionValParse.array;
461             enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered.");
462 
463             auto splitRemaining =
464                 optionVal[optionValParse.consumed + 1 .. $]
465                 .findSplit(":");
466 
467             enforce(splitRemaining[1].empty || !splitRemaining[2].empty,
468                     "Empty custom header.");
469 
470             auto probStr = splitRemaining[0];
471             auto header = splitRemaining[2];
472 
473             double[] probs;
474 
475             foreach (str; probStr.splitter(','))
476             {
477                 double p = str.to!double;
478                 enforce(p >= 0.0 && p <= 1.0,
479                         format("Probability '%g' is not in the interval [0.0,1.0].", p));
480                 probs ~= p;
481             }
482 
483             enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
484                     format("Cannot specify a custom header when using multiple fields or multiple probabilities."));
485 
486             assert (fieldIndices.length > 0);
487             assert (probs.length > 0);
488             assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));
489 
490             foreach (fieldIndex; fieldIndices)
491             {
492                 foreach (p; probs)
493                 {
494                     auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
495                     if (!header.empty) op.setCustomHeader(header);
496                     operators.insertBack(op);
497                 }
498                 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
499             }
500         }
501         catch (Exception e)
502         {
503             e.msg = format(
504                 "[--%s %s]. %s\n   Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
505                 option, optionVal, e.msg, option, option);
506             throw e;
507         }
508 
509     }
510 
511     private void addCountOptionHandler()
512     {
513         cmdLineOperatorOptions ~=
514             (bool hasHeader, string[] headerFields)
515             => countOptionHandler(hasHeader, headerFields);
516     }
517 
518     private void countOptionHandler(bool hasHeader, string[] headerFields)
519     {
520         operators.insertBack(new CountOperator());
521     }
522 
523    private  void addCountHeaderOptionHandler(string option, string optionVal)
524     {
525         cmdLineOperatorOptions ~=
526             (bool hasHeader, string[] headerFields)
527             => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal);
528     }
529 
530     private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
531     {
532         auto op = new CountOperator();
533         op.setCustomHeader(optionVal);
534         operators.insertBack(op);
535     }
536 }
537 
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * It creates a summarizer matching the number of key fields, hands it the operators
 * from the command line, feeds it every input line from every input source, and
 * finally writes the summary body to standard output.
 *
 * Params:
 *   cmdopt = Command line options, as populated by TsvSummarizeOptions.processArgs.
 *
 * Throws: Exception if a line has fewer fields than needed or if the summarizer
 *   fails to process a line.
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
        bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* The input sources are expected to be set up already: at least one source
     * (stdin if nothing else), with newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* Output goes through a BufferedOutputRange rather than directly to stdout. It is
     * faster when many lines are written, which happens mostly when group-by is used.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    alias OutputType = typeof(bufferedOutput);

    /* The Summarizer variant depends on the number of key fields entered. */
    immutable size_t numKeyFields = cmdopt.keyFields.length;

    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!OutputType(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!OutputType(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!OutputType(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

    /* With no input header line but an output header requested, the header is written
     * right away. This helps tasks further on in a unix pipeline detect errors quickly,
     * without waiting for all the data to flow through the pipeline.
     */
    if (cmdopt.writeHeader && !cmdopt.hasHeader)
    {
        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
        bufferedOutput.flush;
    }

    /* Read every input source, one line at a time. Only the first numFieldsNeeded
     * fields of each line are kept.
     */
    immutable size_t numFieldsNeeded = cmdopt.endFieldIndex;
    auto fields = new char[][](numFieldsNeeded);
    bool headerProcessed = false;

    foreach (source; cmdopt.inputSources)
    {
        foreach (lineNum, line; source.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* numFieldsNeeded is zero if no operator needs fields. Notably, the count
             * operator: used by itself, it counts the number of input lines (ala 'wc -l').
             */
            if (numFieldsNeeded > 0)
            {
                size_t numCopied = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter))
                {
                    if (numCopied == numFieldsNeeded) break;
                    fields[numCopied] = fieldValue;
                    numCopied++;
                }

                if (numCopied == 0)
                {
                    assert(numFieldsNeeded > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    fields[numCopied] = line;
                    numCopied++;
                }

                enforce(numCopied >= numFieldsNeeded,
                        format("Not enough fields in line. File: %s, Line: %s",
                               source.name, lineNum));
            }

            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            if (!isHeaderLine)
            {
                /* A data line. Processing will fail (throw) if a field cannot be
                 * converted to the expected type.
                 */
                try
                {
                    summarizer.processNextLine(fields);
                }
                catch (Exception exc)
                {
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               exc.msg, source.name, lineNum,
                               (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""));
                }
            }
            else if (!headerProcessed)
            {
                /* Only the first header line seen is processed; header lines of later
                 * inputs are skipped.
                 */
                summarizer.processHeaderLine(fields);
                headerProcessed = true;

                /* Write the header now. This helps tasks further on in a unix
                 * pipeline detect errors quickly, without waiting for all the
                 * data to flow through the pipeline. Note that an upstream task
                 * may have flushed its header line, so the header may arrive
                 * long before the main block of data.
                 */
                summarizer.writeSummaryHeader(bufferedOutput, printOptions);
                bufferedOutput.flush;
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */

    summarizer.writeSummaryBody(bufferedOutput, printOptions);
}
670 
/** Builds the default header for a field.
 *
 * Used when the input has no field headers but headers are needed in the output.
 * The result is "fieldN", where N is the one-based field number, i.e. the zero-based
 * fieldIndex plus one.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t fieldNumber = fieldIndex + 1;
    return "field" ~ fieldNumber.to!string;
}
680 
unittest
{
    /* Zero-based field index in, one-based "fieldN" header out. */
    immutable size_t[] indices = [0, 10];
    immutable string[] expectedHeaders = ["field1", "field11"];

    foreach (i, index; indices)
    {
        assert(fieldHeaderFromIndex(index) == expectedHeaders[i]);
    }
}
686 
/** Produce a summary header from a field header.
 *
 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is
 * "length" and the operation is "max", the summary header is "length_max". The field
 * header typically comes from a header line in the input data or was constructed by
 * fieldHeaderFromIndex().
 *
 * If operationName is the empty string, then fieldHeader is returned unchanged. This
 * supports the Retain operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    /* No operation name: the field header is the summary header. */
    if (operationName.length == 0) return fieldHeader;

    return fieldHeader ~ "_" ~ operationName;
}
701 
unittest
{
    /* A non-empty operation name is appended after an underscore. */
    immutable withOperation = summaryHeaderFromFieldHeader("originalfield", "mycalc");
    assert(withOperation == "originalfield_mycalc");

    /* An empty operation name leaves the field header unchanged. */
    immutable withoutOperation = summaryHeaderFromFieldHeader("originalfield", "");
    assert(withoutOperation == "originalfield");
}
707 
/** SummarizerPrintOptions holds printing options for Summarizers and Calculators.
 *
 * The values typically come from command line options. They are kept in their own
 * struct for modularity.
 */
struct SummarizerPrintOptions
{
    /// Delimiter placed between fields of an output line.
    char fieldDelimiter;

    /// Presumably the delimiter between multiple values inside one output field
    /// (not used in this struct itself; confirm against the Calculators).
    char valuesDelimiter;

    /// Precision passed to the number formatter; defaults to 12.
    size_t floatPrecision = 12;

    import std.traits : isFloatingPoint, isIntegral;

    /** Formats a floating point or integral number for output.
     *
     * Delegates to tsv_utils.common.numerics.formatNumber, passing floatPrecision.
     */
    auto formatNumber(T)(T n) const
    if (isFloatingPoint!T || isIntegral!T)
    {
        /* Renamed on import so it is not confused with this method's own name. */
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;
        return numericsFormatNumber!T(n, floatPrecision);
    }
}
726 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * The interface is parameterized on OutputRange, the type the summary is written to.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after construction with the range of operators to process.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    /** Called after construction with the range of operators to be processed. */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. lineFields holds the line split into fields. */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line to outputStream. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines to outputStream. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
758 
/** SummarizerBase performs work shared by all summarizers, most everything except for
 * handling of unique keys.
 *
 * The base class handles creation, allocates storage for Operators and SharedFieldValues,
 * and similar. Derived classes deal primarily with unique keys and the associated
 * Calculators and UniqueKeyValuesLists.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /** The field delimiter given at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction.
     *
     * Each operator is appended to the operator list and counted. If an operator asks
     * for numeric or text fields to be saved, the shared field values object is created
     * on first need and the requested field indices are registered with it.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to save for this operator. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (index; numericIndices) _sharedFieldValues.addNumericIndex(index);
            foreach (index; textIndices) _sharedFieldValues.addTextIndex(index);
        }
    }

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     *
     *  Only the first header line is forwarded to the operators.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a new UniqueKeyValuesLists from the shared field values, or returns
     *  null when no operator requested saved field values.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
836 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * There is a single set of Calculators (one per Operator) and at most one
 * UniqueKeyValuesLists, since all lines belong to the same (implicit) group.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object with the operators to be processed. */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operator, so create them as Operators are added.
         *
         * The calculators are built from the stored operator list rather than by
         * iterating 'operators' again: the base class has already traversed that
         * range, and the InputRange interface only guarantees a single pass.
         * Operators that already have a calculator (from an earlier call) are skipped.
         */
        foreach (op; _operators[].drop(_calculators.length))
        {
            _calculators ~= op.makeCalculator;
        }
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the field delimiter. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result lines. With no key there is exactly one result line,
     *  holding each calculator's result joined by the field delimiter.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
886 
/** KeySummarizerBase does work shared by the single key and multi-key summarizers.
 *
 * The primary difference between those two is the formation of the key. The primary
 * reason for separating those into two separate classes is to simplify (speed-up)
 * handling of single field keys, which are the most common use case.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /* Per-key state: one Calculator per Operator, plus the saved field values (may be null). */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;               // Keys in the order they were first added.
    private UniqueKeyData[string] _uniqueKeyData;   // Per-key state, looked up by key.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Processes a line for the given key, registering the key if it has not been seen. */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData entry;
        if (auto existing = key in _uniqueKeyData)
        {
            entry = *existing;
        }
        else
        {
            entry = addUniqueKey(key.to!string);
        }

        foreach (calculator; entry.calculators) calculator.processNextLine(lineFields);
        if (entry.valuesLists !is null) entry.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Registers a new key: records it in the key list and creates its per-key state. */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        /* One calculator per operator, in operator order. */
        auto keyCalculators = new Calculator[_numOperators];
        size_t slot = 0;
        foreach (op; _operators)
        {
            keyCalculators[slot] = op.makeCalculator;
            ++slot;
        }

        return _uniqueKeyData[key] = UniqueKeyData(keyCalculators, super.makeUniqueKeyValuesLists());
    }

    /** Writes the header line: the key field header followed by the operator headers. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        immutable delimiter = printOptions.fieldDelimiter;

        put(outputStream, keyFieldHeader());
        put(outputStream, delimiter);
        put(outputStream, _operators[].map!(op => op.header).join(delimiter));
        put(outputStream, '\n');
    }

    /** Writes one result line per key, in the order the keys were added. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        immutable delimiter = printOptions.fieldDelimiter;

        foreach (key; _uniqueKeys)
        {
            auto keyData = _uniqueKeyData[key];

            put(outputStream, key);
            put(outputStream, delimiter);
            put(outputStream,
                keyData.calculators[]
                .map!(calculator => calculator.calculate(keyData.valuesLists, printOptions))
                .join(delimiter));
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}
962 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key field header defaults to "fieldN" (see fieldHeaderFromIndex) and is replaced
 * by the actual header text when the first header line is processed.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;
    /* Not referenced within this class; the key list used for output is the one
     * held by KeySummarizerBase.
     */
    private DList!string _uniqueKeys;

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line, the key field header is
     *  taken from the line. Returns true if this was the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* The key field is read by index below, so the index must be strictly less
         * than the number of fields (same condition as processNextLine).
         */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the key field's value as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
1001 
/** This Summarizer is for the case where the unique key is based on multiple fields.
 *
 * The key is formed by joining the key field values with the input field delimiter.
 * The key field header is formed the same way, from default "fieldN" headers at
 * construction and from the actual header text once a header line is processed.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;

        auto defaultHeaders = _keyFieldIndices.map!(i => fieldHeaderFromIndex(i));
        _keyFieldHeader = defaultHeaders.join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line, the key field header is
     *  rebuilt from the header text of the key fields. Returns true if this was the
     *  first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        immutable bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine) _keyFieldHeader = joinKeyFields(lineFields);
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the joined key field values as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        processNextLineWithKey(joinKeyFields(lineFields), lineFields);
    }

    /* Joins the key fields of a line, in key field order, with the input field delimiter. */
    private string joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
    }
}
1046 
1047 version(unittest)
1048 {
1049     /* testSummarizer is a helper that can run many types of unit tests against
1050      * Summarizers. It can also test operators, but there are separate helper functions
1051      * better suited for that purpose.
1052      *
1053      * Arguments are a command line args, an input file, and expected output. The
1054      * input file and expected output are already split into lines and fields, the helper
1055      * manages re-assembly. The program name from the command line args is printed if an
1056      * an error occurs, it is useful to identify the test that failed.
1057      *
1058      * Note: Much of this is a duplication tsvSummarize logic. Better abstraction of
1059      * file input/output would enable running unit tests directly on top of tsvSummarize.
1060      *
1061      * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
1062      * there needs to be a physical file when call processArgs. Its hard to get around,
1063      * as the intent is to read the header line of the first input file during command
1064      * line argument processing. Eventually this unit test process will need to be
1065      * rewritten. For now, a file with the equivalent data is being added to the command
1066      * line.
1067      */
1068     void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
1069     {
1070         import std.array : appender;
1071 
1072         assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");
1073 
1074         auto formatAssertMessage(T...)(string msg, T formatArgs)
1075         {
1076             auto formatString = "[testSummarizer] %s: " ~ msg;
1077             return format(formatString, cmdArgs[0], formatArgs);
1078         }
1079 
1080         TsvSummarizeOptions cmdopt;
1081         auto savedCmdArgs = cmdArgs.to!string;
1082         auto r = cmdopt.processArgs(cmdArgs);
1083         assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));
1084 
1085         assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
1086                formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));
1087 
1088         /* Pick the Summarizer based on the number of key-fields entered. */
1089         auto summarizer =
1090             (cmdopt.keyFields.length == 0)
1091             ? new NoKeySummarizer!(typeof(appender!(char[])()))(
1092                 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
1093 
1094             : (cmdopt.keyFields.length == 1)
1095             ? new OneKeySummarizer!(typeof(appender!(char[])()))(
1096                 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
1097 
1098             : new MultiKeySummarizer!(typeof(appender!(char[])()))(
1099                 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
1100 
1101         /* Add the operators to the Summarizer. */
1102         summarizer.setOperators(inputRangeObject(cmdopt.operators[]));
1103 
1104         /* Process the file one line at a time. */
1105         auto lineFields = new char[][](cmdopt.endFieldIndex);
1106         bool headerFound = false;
1107         foreach (lineNum, line; file.enumerate(1))
1108         {
1109             /* Copy the needed fields to the fields array. */
1110             foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;
1111 
1112             if (cmdopt.hasHeader && lineNum == 1)
1113             {
1114                 if (!headerFound)
1115                 {
1116                     summarizer.processHeaderLine(lineFields);
1117                     headerFound = true;
1118                 }
1119             }
1120             else
1121             {
1122                 try summarizer.processNextLine(lineFields);
1123                 catch (Exception exc)
1124                 {
1125                     assert(false, formatAssertMessage(exc.msg));
1126                 }
1127             }
1128         }
1129         auto printOptions = SummarizerPrintOptions(
1130         cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
1131 
1132         auto summarizerOutput = appender!(char[])();
1133 
1134         if (cmdopt.hasHeader || cmdopt.writeHeader)
1135         {
1136             summarizer.writeSummaryHeader(summarizerOutput, printOptions);
1137         }
1138 
1139         summarizer.writeSummaryBody(summarizerOutput, printOptions);
1140         auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
1141         if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";
1142 
1143         assert(summarizerOutput.data == expectedOutput,
1144                formatAssertMessage(
1145                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
1146                    expectedOutput.to!string, summarizerOutput.data.to!string));
1147     }
1148 
1149     void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t")
1150     {
1151         import std.algorithm;
1152         import std.stdio;
1153 
1154         auto f = filepath.File("w");
1155         foreach (record; fileData) f.writeln(record.joiner(delimiter));
1156         f.close;
1157     }
1158 }
1159 
1160 unittest
1161 {
1162     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1163     import std.file : mkdir, rmdirRecurse;
1164     import std.path : buildPath;
1165 
1166     auto testDir = makeUnittestTempDir("tsv_summarizer");
1167     scope(exit) testDir.rmdirRecurse;
1168 
1169     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1170      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1171      * have separate tests, those tests test the no-key summarizer. The Values operator is
1172      * used in these tests. It engages a number of behaviors, and the results have limited
1173      * ambiguity. Using only one operator limits dependence on individual operators.
1174      *
1175      * Update (April 2020): There now needs to be a real file passed to testSummarizer.
1176      * See the comments with testSummarizer for details.
1177      */
1178 
1179     auto file1 = [["fld1", "fld2", "fld3"],
1180                   ["a", "a",  "3"],
1181                   ["c", "a",  "2b"],
1182                   ["c", "bc", ""],
1183                   ["a", "c",  "2b"],
1184                   ["",  "bc", ""],
1185                   ["c", "bc", "3"]];
1186 
1187     auto file1Path = buildPath(testDir, "file1.tsv");
1188     auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv");
1189     writeDataFile(file1Path, file1);
1190     writeDataFile(file1NoHeaderPath, file1[1 .. $]);
1191 
1192     /* Single-key summarizer tests.
1193      */
1194     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path],
1195                    file1,
1196                    [["fld1", "fld1_values"],
1197                     ["a", "a|a"],
1198                     ["c", "c|c|c"],
1199                     ["",  ""]]
1200         );
1201     testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path],
1202                    file1,
1203                    [["fld1", "fld1_values"],
1204                     ["a", "a|a"],
1205                     ["c", "c|c|c"],
1206                     ["",  ""]]
1207         );
1208     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path],
1209                    file1,
1210                    [["fld1", "fld2_values"],
1211                     ["a", "a|c"],
1212                     ["c", "a|bc|bc"],
1213                     ["",  "bc"]]
1214         );
1215     testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path],
1216                    file1,
1217                    [["fld1", "fld2_values"],
1218                     ["a", "a|c"],
1219                     ["c", "a|bc|bc"],
1220                     ["",  "bc"]]
1221         );
1222     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path],
1223                    file1,
1224                    [["fld1", "fld3_values"],
1225                     ["a", "3|2b"],
1226                     ["c", "2b||3"],
1227                     ["",  ""]]
1228         );
1229     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path],
1230                    file1,
1231                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1232                     ["a", "a|a",   "a|c",     "3|2b"],
1233                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1234                     ["",  "",      "bc",      ""]]
1235         );
1236     testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path],
1237                    file1,
1238                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1239                     ["a", "a|a",   "a|c",     "3|2b"],
1240                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1241                     ["",  "",      "bc",      ""]]
1242         );
1243     testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path],
1244                    file1,
1245                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1246                     ["a", "a|a",   "a|c",     "3|2b"],
1247                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1248                     ["",  "",      "bc",      ""]]
1249         );
1250     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path],
1251                    file1,
1252                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1253                     ["a", "a|a",   "a|c",     "3|2b"],
1254                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1255                     ["",  "",      "bc",      ""]]
1256         );
1257     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path],
1258                    file1,
1259                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1260                     ["a", "3|2b",  "a|c",     "a|a"],
1261                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1262                     ["",  "",      "bc",      ""]]
1263         );
1264     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path],
1265                    file1,
1266                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1267                     ["a", "3|2b",  "a|c",     "a|a"],
1268                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1269                     ["",  "",      "bc",      ""]]
1270         );
1271     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path],
1272                    file1,
1273                    [["fld2", "fld1_values"],
1274                     ["a",  "a|c"],
1275                     ["bc", "c||c"],
1276                     ["c",  "a"]]
1277         );
1278     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path],
1279                    file1,
1280                    [["fld2", "fld2_values"],
1281                     ["a",  "a|a"],
1282                     ["bc", "bc|bc|bc"],
1283                     ["c",  "c"]]
1284         );
1285     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path],
1286                    file1,
1287                    [["fld2", "fld3_values"],
1288                     ["a",  "3|2b"],
1289                     ["bc", "||3"],
1290                     ["c",  "2b"]]
1291         );
1292     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path],
1293                    file1,
1294                    [["fld2", "fld1_values", "fld3_values"],
1295                     ["a",  "a|c",  "3|2b"],
1296                     ["bc", "c||c", "||3"],
1297                     ["c",  "a",    "2b"]]
1298         );
1299     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path],
1300                    file1,
1301                    [["fld2", "fld3_values", "fld1_values"],
1302                     ["a",  "3|2b", "a|c"],
1303                     ["bc", "||3",  "c||c"],
1304                     ["c",  "2b",   "a"]]
1305         );
1306     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path],
1307                    file1,
1308                    [["fld3", "fld1_values"],
1309                     ["3",  "a|c"],
1310                     ["2b", "c|a"],
1311                     ["",   "c|"]]
1312         );
1313     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path],
1314                    file1,
1315                    [["fld3", "fld2_values"],
1316                     ["3",  "a|bc"],
1317                     ["2b", "a|c"],
1318                     ["",   "bc|bc"]]
1319         );
1320     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path],
1321                    file1,
1322                    [["fld3", "fld1_values", "fld2_values"],
1323                     ["3",  "a|c", "a|bc"],
1324                     ["2b", "c|a", "a|c"],
1325                     ["",   "c|",  "bc|bc"]]
1326         );
1327     testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path],
1328                    file1,
1329                    [["fld3", "fld1_values", "fld2_values"],
1330                     ["3",  "a|c", "a|bc"],
1331                     ["2b", "c|a", "a|c"],
1332                     ["",   "c|",  "bc|bc"]]
1333         );
1334 
1335     /* Multi-key summarizer tests.
1336      */
1337     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path],
1338                    file1,
1339                    [["fld1", "fld2", "fld1_values"],
1340                     ["a", "a",  "a"],
1341                     ["c", "a",  "c"],
1342                     ["c", "bc", "c|c"],
1343                     ["a", "c",  "a"],
1344                     ["", "bc",  ""]]
1345         );
1346     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path],
1347                    file1,
1348                    [["fld1", "fld2", "fld2_values"],
1349                     ["a", "a",  "a"],
1350                     ["c", "a",  "a"],
1351                     ["c", "bc", "bc|bc"],
1352                     ["a", "c",  "c"],
1353                     ["", "bc",  "bc"]]
1354         );
1355     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path],
1356                    file1,
1357                    [["fld1", "fld2", "fld3_values"],
1358                     ["a", "a",  "3"],
1359                     ["c", "a",  "2b"],
1360                     ["c", "bc", "|3"],
1361                     ["a", "c",  "2b"],
1362                     ["", "bc",  ""]]
1363         );
1364     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path],
1365                    file1,
1366                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1367                     ["a", "a",  "3", "a"],
1368                     ["c", "a",  "2b", "c"],
1369                     ["c", "bc", "|3", "c|c"],
1370                     ["a", "c",  "2b", "a"],
1371                     ["",  "bc", "",   ""]]
1372         );
1373     testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path],
1374                    file1,
1375                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1376                     ["a", "a",  "3", "a"],
1377                     ["c", "a",  "2b", "c"],
1378                     ["c", "bc", "|3", "c|c"],
1379                     ["a", "c",  "2b", "a"],
1380                     ["",  "bc", "",   ""]]
1381         );
1382     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path],
1383                    file1,
1384                    [["fld3", "fld2", "fld1_values"],
1385                     ["3",  "a",  "a"],
1386                     ["2b", "a",  "c"],
1387                     ["",   "bc", "c|"],
1388                     ["2b", "c",  "a"],
1389                     ["3",  "bc", "c"]]
1390         );
1391     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path],
1392                    file1,
1393                    [["fld3", "fld2", "fld1_values"],
1394                     ["3",  "a",  "a"],
1395                     ["2b", "a",  "c"],
1396                     ["",   "bc", "c|"],
1397                     ["2b", "c",  "a"],
1398                     ["3",  "bc", "c"]]
1399         );
1400     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path],
1401                    file1,
1402                    [["fld2", "fld1", "fld3", "fld2_values"],
1403                     ["a",  "a", "3",  "a"],
1404                     ["a",  "c", "2b", "a"],
1405                     ["bc", "c", "",   "bc"],
1406                     ["c",  "a", "2b", "c"],
1407                     ["bc", "",  "",   "bc"],
1408                     ["bc", "c", "3",  "bc"]]
1409         );
1410 
1411     /* Missing policies. */
1412     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path],
1413                    file1,
1414                    [["fld1", "fld1_values"],
1415                     ["a", "a|a"],
1416                     ["c", "c|c|c"],
1417                     ["",  ""]]
1418         );
1419     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path],
1420                    file1,
1421                    [["fld1", "fld2_values"],
1422                     ["a", "a|c"],
1423                     ["c", "a|bc|bc"],
1424                     ["",  "bc"]]
1425         );
1426     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path],
1427                    file1,
1428                    [["fld1", "fld3_values"],
1429                     ["a", "3|2b"],
1430                     ["c", "2b|3"],
1431                     ["",  ""]]
1432         );
1433     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path],
1434                    file1,
1435                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1436                     ["a", "a|a",   "a|c",     "3|2b"],
1437                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1438                     ["",  "",      "bc",      ""]]
1439         );
1440     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path],
1441                    file1,
1442                    [["fld1", "fld1_values"],
1443                     ["a", "a|a"],
1444                     ["c", "c|c|c"],
1445                     ["",  "NA"]]
1446         );
1447     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path],
1448                    file1,
1449                    [["fld1", "fld2_values"],
1450                     ["a", "a|c"],
1451                     ["c", "a|bc|bc"],
1452                     ["",  "bc"]]
1453         );
1454     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path],
1455                    file1,
1456                    [["fld1", "fld3_values"],
1457                     ["a", "3|2b"],
1458                     ["c", "2b|NA|3"],
1459                     ["",  "NA"]]
1460         );
1461     testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", "--values", "fld3", "-r", "NA", file1Path],
1462                    file1,
1463                    [["fld1", "fld3_values"],
1464                     ["a", "3|2b"],
1465                     ["c", "2b|NA|3"],
1466                     ["",  "NA"]]
1467         );
1468     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path],
1469                    file1,
1470                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1471                     ["a", "a|a",   "a|c",     "3|2b"],
1472                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1473                     ["",  "NA",      "bc",      "NA"]]
1474         );
1475     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path],
1476                    file1,
1477                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1478                     ["a", "a",  "3", "a"],
1479                     ["c", "a",  "2b", "c"],
1480                     ["c", "bc", "3", "c|c"],
1481                     ["a", "c",  "2b", "a"],
1482                     ["",  "bc", "",   ""]]
1483         );
1484     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path],
1485                    file1,
1486                    [["fld3", "fld2", "fld1_values"],
1487                     ["3",  "a",  "a"],
1488                     ["2b", "a",  "c"],
1489                     ["",   "bc", "c"],
1490                     ["2b", "c",  "a"],
1491                     ["3",  "bc", "c"]]
1492         );
1493     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path],
1494                    file1,
1495                    [["fld2", "fld1", "fld3", "fld2_values"],
1496                     ["a",  "a", "3",  "a"],
1497                     ["a",  "c", "2b", "a"],
1498                     ["bc", "c", "",   "bc"],
1499                     ["c",  "a", "2b", "c"],
1500                     ["bc", "",  "",   "bc"],
1501                     ["bc", "c", "3",  "bc"]]
1502         );
1503     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path],
1504                    file1,
1505                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1506                     ["a", "a",  "3", "a"],
1507                     ["c", "a",  "2b", "c"],
1508                     ["c", "bc", "NA|3", "c|c"],
1509                     ["a", "c",  "2b", "a"],
1510                     ["",  "bc", "NA",   "NA"]]
1511         );
1512     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path],
1513                    file1,
1514                    [["fld3", "fld2", "fld1_values"],
1515                     ["3",  "a",  "a"],
1516                     ["2b", "a",  "c"],
1517                     ["",   "bc", "c|NA"],
1518                     ["2b", "c",  "a"],
1519                     ["3",  "bc", "c"]]
1520         );
1521     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path],
1522                    file1,
1523                    [["fld2", "fld1", "fld3", "fld2_values"],
1524                     ["a",  "a", "3",  "a"],
1525                     ["a",  "c", "2b", "a"],
1526                     ["bc", "c", "",   "bc"],
1527                     ["c",  "a", "2b", "c"],
1528                     ["bc", "",  "",   "bc"],
1529                     ["bc", "c", "3",  "bc"]]
1530         );
1531 
1532     /* Validate that the no-key summarizer works with testSummarizer helper function.
1533      */
1534     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path],
1535                    file1,
1536                    [["fld1_values", "fld2_values"],
1537                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1538         );
1539     testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path],
1540                    file1,
1541                    [["fld1_values", "fld2_values"],
1542                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1543         );
1544 
1545     /* Header variations: no header line; auto-generated header line; custom headers.
1546      */
1547     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath],
1548                    file1[1..$],
1549                    [["a", "a|a"],
1550                     ["c", "c|c|c"],
1551                     ["",  ""]]
1552         );
1553     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath],
1554                    file1[1..$],
1555                    [["a", "a",  "a"],
1556                     ["c", "a",  "a"],
1557                     ["c", "bc", "bc|bc"],
1558                     ["a", "c",  "c"],
1559                     ["", "bc",  "bc"]]
1560         );
1561     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath],
1562                    file1[1..$],
1563                    [["field2", "field1_values"],
1564                     ["a",  "a|c"],
1565                     ["bc", "c||c"],
1566                     ["c",  "a"]]
1567         );
1568     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath],
1569                    file1[1..$],
1570                    [["field3", "field2", "field1_values"],
1571                     ["3",  "a",  "a"],
1572                     ["2b", "a",  "c"],
1573                     ["",   "bc", "c|"],
1574                     ["2b", "c",  "a"],
1575                     ["3",  "bc", "c"]]
1576         );
1577     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path],
1578                    file1,
1579                    [["fld2", "Field3Values"],
1580                     ["a",  "3|2b"],
1581                     ["bc", "||3"],
1582                     ["c",  "2b"]]
1583         );
1584     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path],
1585                    file1,
1586                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1587                     ["a", "a",  "3", "a"],
1588                     ["c", "a",  "2b", "c"],
1589                     ["c", "bc", "|3", "c|c"],
1590                     ["a", "c",  "2b", "a"],
1591                     ["",  "bc", "",   ""]]
1592         );
1593     testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path],
1594                    file1,
1595                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1596                     ["a", "a",  "3", "a"],
1597                     ["c", "a",  "2b", "c"],
1598                     ["c", "bc", "|3", "c|c"],
1599                     ["a", "c",  "2b", "a"],
1600                     ["",  "bc", "",   ""]]
1601         );
1602     testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path],
1603                    file1,
1604                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1605                     ["a", "a",  "3", "a"],
1606                     ["c", "a",  "2b", "c"],
1607                     ["c", "bc", "|3", "c|c"],
1608                     ["a", "c",  "2b", "a"],
1609                     ["",  "bc", "",   ""]]
1610         );
1611     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath],
1612                    file1[1..$],
1613                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1614                     ["a", "3|2b",  "a|c",     "a|a"],
1615                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1616                     ["",  "",      "bc",      ""]]
1617         );
1618     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1619                    file1[1..$],
1620                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1621                     ["a", "3",  "a",  "3",  "a", "a"],
1622                     ["c", "2b", "a",  "2b", "c", "a"],
1623                     ["c", "",   "bc", "",   "c", "bc"],
1624                     ["a", "2b", "c",  "2b", "a", "c"],
1625                     ["",  "",   "bc", "",   "",  "bc"],
1626                     ["c", "3",  "bc", "3",  "c", "bc"]]
1627         );
1628     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1629                    file1[1..$],
1630                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1631                     ["a", "3",  "a",  "3",  "a", "a"],
1632                     ["c", "2b", "a",  "2b", "c", "a"],
1633                     ["c", "",   "bc", "",   "c", "bc"],
1634                     ["a", "2b", "c",  "2b", "a", "c"],
1635                     ["",  "",   "bc", "",   "",  "bc"],
1636                     ["c", "3",  "bc", "3",  "c", "bc"]]
1637         );
1638 
1639     /* Alternate file widths and lengths.
1640      */
1641 
1642     auto file3x2 = [["fld1", "fld2", "fld3"],
1643                     ["a", "b", "c"],
1644                     ["c", "b", "a"]];
1645 
1646     auto file3x2Path = buildPath(testDir, "file3x2.tsv");
1647     auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv");
1648     writeDataFile(file3x2Path, file3x2);
1649     writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]);
1650 
1651     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path],
1652                    file3x2,
1653                    [["fld1", "fld3_values"],
1654                     ["a", "c"],
1655                     ["c", "a"]]
1656         );
1657     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path],
1658                    file3x2,
1659                    [["fld2", "fld3_values"],
1660                     ["b", "c|a"]]
1661         );
1662     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path],
1663                    file3x2,
1664                    [["fld2", "fld1", "fld3_values"],
1665                     ["b", "a", "c"],
1666                     ["b", "c", "a"]]
1667         );
1668 
1669     auto file3x1 = [["fld1", "fld2", "fld3"],
1670                     ["a", "b", "c"]];
1671 
1672     auto file3x1Path = buildPath(testDir, "file3x1.tsv");
1673     auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv");
1674     writeDataFile(file3x1Path, file3x1);
1675     writeDataFile(file3x1NoHeaderPath, file3x1[1 .. $]);
1676 
1677     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path],
1678                    file3x1,
1679                    [["fld1", "fld3_values"],
1680                     ["a", "c"]]
1681         );
1682     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath],
1683                    file3x1[1..$],
1684                    [["a", "c"]]
1685         );
1686     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path],
1687                    file3x1,
1688                    [["fld2", "fld1", "fld3_values"],
1689                     ["b", "a", "c"]]
1690         );
1691     testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path],
1692                    file3x1,
1693                    [["fld2", "fld1", "fld3_values"],
1694                     ["b", "a", "c"]]
1695         );
1696     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath],
1697                    file3x1[1..$],
1698                    [["b", "a", "c"]]
1699         );
1700 
1701     auto file3x0 = [["fld1", "fld2", "fld3"]];
1702 
1703     auto file3x0Path = buildPath(testDir, "file3x0.tsv");
1704     auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv");
1705     writeDataFile(file3x0Path, file3x0);
1706     writeDataFile(file3x0NoHeaderPath, file3x0[1 .. $]);
1707 
1708 
1709     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path],
1710                    file3x0,
1711                    [["fld1", "fld3_values"]]
1712         );
1713     testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path],
1714                    file3x0,
1715                    [["fld1", "fld3_values"]]
1716         );
1717     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1718                    file3x0[1..$],
1719                    []
1720         );
1721     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1722                    file3x0[1..$],
1723                    [["field1", "field3_values"]]
1724         );
1725 
1726 
1727     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path],
1728                    file3x0,
1729                    [["fld2", "fld1", "fld3_values"]]
1730         );
1731 
1732     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1733                    file3x0[1..$],
1734                    []
1735         );
1736 
1737     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1738                    file3x0[1..$],
1739                    [["field2", "field1", "field3_values"]]
1740         );
1741 
1742     auto file2x1 = [["fld1", "fld2"],
1743                     ["a", "b"]];
1744 
1745     auto file2x1Path = buildPath(testDir, "file2x1.tsv");
1746     auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv");
1747     writeDataFile(file2x1Path, file2x1);
1748     writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]);
1749 
1750     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path],
1751                    file2x1,
1752                    [["fld1", "fld2_values"],
1753                     ["a", "b"]]
1754         );
1755     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path],
1756                    file2x1,
1757                    [["fld2", "fld1", "fld1_values"],
1758                     ["b", "a", "a"]]
1759         );
1760 
1761     auto file2x0 = [["fld1", "fld2"]];
1762 
1763     auto file2x0Path = buildPath(testDir, "file2x0.tsv");
1764     auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv");
1765     writeDataFile(file2x0Path, file2x0);
1766     writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]);
1767 
1768     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path],
1769                    file2x0,
1770                    [["fld1", "fld2_values"]]
1771         );
1772     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path],
1773                    file2x0,
1774                    [["fld2", "fld1", "fld1_values"]]
1775         );
1776 
1777     auto file1x2 = [["fld1"],
1778                     ["a"],
1779                     [""]];
1780 
1781     auto file1x2Path = buildPath(testDir, "file1x2.tsv");
1782     auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv");
1783     writeDataFile(file1x2Path, file1x2);
1784     writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]);
1785 
1786     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path],
1787                    file1x2,
1788                    [["fld1", "fld1_values"],
1789                     ["a", "a"],
1790                     ["",  ""]]
1791         );
1792 
1793     auto file1x2b = [["fld1"],
1794                      [""],
1795                      [""]];
1796 
1797     auto file1x2bPath = buildPath(testDir, "file1x2b.tsv");
1798     auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv");
1799     writeDataFile(file1x2bPath, file1x2b);
1800     writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]);
1801 
1802     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath],
1803                    file1x2b,
1804                    [["fld1", "fld1_values"],
1805                     ["", "|"]]
1806         );
1807 
1808     auto file1x1 = [["fld1"],
1809                     ["x"]];
1810 
1811     auto file1x1Path = buildPath(testDir, "file1x1.tsv");
1812     auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv");
1813     writeDataFile(file1x1Path, file1x1);
1814     writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]);
1815 
1816     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path],
1817                    file1x1,
1818                    [["fld1", "fld1_values"],
1819                     ["x", "x"]]
1820         );
1821     testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path],
1822                    file1x1,
1823                    [["fld1", "fld1_values"],
1824                     ["x", "x"]]
1825         );
1826 
1827     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1828                    file1x1[1..$],
1829                    [["x", "x"]]
1830         );
1831 
1832     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1833                    file1x1[1..$],
1834                    [["field1", "field1_values"],
1835                     ["x", "x"]]
1836         );
1837 
1838     auto file1x1b = [["fld1"],
1839                     [""]];
1840 
1841     auto file1x1bPath = buildPath(testDir, "file1x1b.tsv");
1842     auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv");
1843     writeDataFile(file1x1bPath, file1x1b);
1844     writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]);
1845 
1846     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath],
1847                    file1x1b,
1848                    [["fld1", "fld1_values"],
1849                     ["", ""]]
1850         );
1851 
1852     auto file1x0 = [["fld1"]];
1853 
1854     auto file1x0Path = buildPath(testDir, "file1x0.tsv");
1855     auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv");
1856     writeDataFile(file1x0Path, file1x0);
1857     writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]);
1858 
1859     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path],
1860                    file1x0,
1861                    [["fld1", "fld1_values"]]
1862         );
1863 
1864     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1865                    file1x0[1..$],
1866                    []
1867         );
1868 
1869     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1870                    file1x0[1..$],
1871                    [["field1", "field1_values"]]
1872         );
1873 
1874     /* Alternate delimiters.
1875      *
1876      * Note: In current unit test setup the data is already in memory (file1).
1877      * 'file1Path' points to a file with equivalent data, but not read, except if
1878      * processing the header line. A data file is created for the '%' and '#'
1879      * delimiter cases (these read the header), but we don't bother for the others.
1880      */
1881     auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv");
1882     auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv");
1883     writeDataFile(file1PctDelimPath, file1, "%");
1884     writeDataFile(file1HashDelimPath, file1, "#");
1885 
1886     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath],
1887                    file1,
1888                    [["fld1_values", "fld2_values"],
1889                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1890         );
1891     testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath],
1892                    file1,
1893                    [["fld1_values", "fld2_values"],
1894                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1895         );
1896     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path],
1897                    file1,
1898                    [["fld1_values", "fld2_values"],
1899                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1900         );
1901     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
1902                    file1,
1903                    [["fld1_values", "fld2_values"],
1904                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1905         );
1906     testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath],
1907                    file1,
1908                    [["fld1_values", "fld2_values"],
1909                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1910         );
1911     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1912                     "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath],
1913                    file1[1..$],
1914                    [["field2", "field1_values"],
1915                     ["a",  "a:c"],
1916                     ["bc", "c::c"],
1917                     ["c",  "a"]]
1918         );
1919     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1920                     "--values-delimiter", "\\", file1NoHeaderPath],
1921                    file1[1..$],
1922                    [["a", "a",  "a"],
1923                     ["c", "a",  "a"],
1924                     ["c", "bc", "bc\\bc"],
1925                     ["a", "c",  "c"],
1926                     ["", "bc",  "bc"]]
1927         );
1928 }
1929 
1930 /* Summary Operators and Calculators
1931  *
1932  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1933  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1934  * Calculator is used to manage the summary calculation for each unique key in the input.
1935  *
1936  * As an example, consider the command:
1937  *
1938  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1939  *
1940  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1941  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1942  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1943  * calculator needs to track occurrence count and sum. Calculators produce the final
1944  * value when all processing is finished.
1945  *
1946  * Summary field headers
1947  *
1948  * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden
 * on the command line. These scenarios are supported via the operator constructor and the
1951  * processHeaderLine() method.
1952  *
1953  * Missing field policy
1954  *
1955  * At present, tsv-summarize has a single policy for handling missing values that applies
1956  * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
1959  */
1960 
/** An Operator is the object form of one summary calculation requested on the
 *  command line, e.g. '--mean 5'.
 *
 *  Operators supply the header of the output field they produce, advertise the
 *  input fields they need saved, and create the Calculator objects that perform
 *  the calculation for each unique key.
 */
interface Operator
{
    /// Header of the summary field produced by this operator.
    string header() @property;

    /// Name of this operator.
    string name() @property;

    /// Receives the fields of the input's header line.
    void processHeaderLine(const char[][] fields);

    /// Numeric fields this Operator needs saved.
    size_t[] numericFieldsToSave();

    /// Text fields this Operator needs saved.
    size_t[] textFieldsToSave();

    /// Creates a Calculator for this operator.
    Calculator makeCalculator();
}
1973 
/** Calculators are responsible for the calculation of a single computation. They
 *  process each line and produce the final value when all processing is finished.
 *
 *  Calculators are created by an Operator via Operator.makeCalculator().
 */
interface Calculator
{
    /// Called with the fields of an input line so the calculator can update its state.
    void processNextLine(const char[][] fields);

    /** Produces the final value of the calculation as a string.
     *
     *  Params:
     *    valuesLists  = The saved field values (see UniqueKeyValuesLists).
     *    printOptions = Print options, passed by const reference.
     *                   NOTE(review): presumably output formatting settings; confirm
     *                   against SummarizerPrintOptions.
     *
     *  Returns: The calculated value as a string.
     */
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}
1982 
/** Describes how missing (empty) field values are treated.
 *
 *  The policy is always in exactly one of three modes:
 *    - use:     missing values are processed unchanged (the default).
 *    - exclude: selected by passing excludeMissing = true.
 *    - replace: selected by passing a non-empty replacement string.
 *
 *  A non-empty replacement string takes precedence: if it is supplied together
 *  with excludeMissing = true, the policy is in replace mode, not exclude mode.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are processed as-is.
    private bool _replaceMissing = false;     // Missing values are swapped for _missingReplacement.
    private string _missingReplacement;       // Only meaningful when _replaceMissing is true.

    /** Creates a policy; with no arguments the policy is "use missing values unchanged".
     *
     *  Params:
     *    excludeMissing     = Request exclude mode.
     *    missingReplacement = Replacement text; a non-empty value requests replace mode.
     */
    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /** Resets the policy from the given settings. Takes the same parameters as the
     *  constructor and fully overwrites the previous state.
     */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        const bool hasReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = hasReplacement;

        // Values are used unchanged only when neither exclusion nor replacement applies.
        _useMissing = !(excludeMissing || hasReplacement);
    }

    /// A field is considered missing when it is empty.
    @property final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    /// True if missing values are processed unchanged.
    @property final bool useMissing() const
    {
        return _useMissing;
    }

    /// True if the policy is neither "use" nor "replace".
    @property final bool excludeMissing() const
    {
        return !(_useMissing || _replaceMissing);
    }

    /// True if missing values are replaced by missingReplacement.
    @property final bool replaceMissing() const
    {
        return _replaceMissing;
    }

    /// The replacement string given to the constructor or updatePolicy.
    @property final string missingReplacement() const
    {
        return _missingReplacement;
    }
}
2028 
2029 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
2030  * while reading data. Operations like median collect all values and operate on them when
2031  * running the final calculation. Value lists are needed for each unique key. A command
2032  * using multiple Operators may save multiple fields. And, different Operators may be run
2033  * against the same field.
2034  *
2035  * The last part motivates these classes. Handling large data sets necessitates minimizing
2036  * in-memory storage, making it desirable to share identical lists between Calculators.
2037  * Otherwise, each Calculator could implement its own storage, which would be simpler.
2038  *
2039  * The setup works as follows:
2040  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
2041  *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list
2042  *    of the fields advertised by Operators as needing sharing. This list gets created
2043  *    during command initialization (SummarizerBase.setOperators).
2044  *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
2046  *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
2047  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
2048  *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextLine method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
2053  *    at least partially. Rather than generate sorted copies, the current algorithms
2054  *    sort the data in place.
2055  *
2056  * One concession to duplicate storage is that text and numeric versions of the same
2057  * field might be stored. The reason is because it's important to convert text to numbers
2058  * as they are read so that useful error messages can be generated. And, storing both
2059  * forms of the same field should be less common.
2060  *
2061  * The current implementation uses the same missing values policy for all fields. If
2062  * multiple policies become supported this will need to change.
2063  *
 * Built-in calculations - UniqueKeyValuesLists objects have a built-in median operation.
 * This is to avoid repeated calculations of the median by different Calculators.
2066  */
2067 
final class SharedFieldValues
{
    // Zero-based indices of the fields whose values must be saved, one list per value type.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Registers a numeric field index whose values need to be saved. Called during
     * summarizer setup, e.g. '--median 7' ends up calling addNumericIndex(6), 6 being
     * the zero-based index. An index that is already registered is not added again.
     */
    final void addNumericIndex (size_t index)
    {
        if (_numericFieldIndices.canFind(index)) return;
        _numericFieldIndices ~= index;
    }

    /* Text counterpart of addNumericIndex: registers a text field index, once. */
    final void addTextIndex (size_t index)
    {
        if (_textFieldIndices.canFind(index)) return;
        _textFieldIndices ~= index;
    }

    /* Creates a fresh UniqueKeyValuesLists covering the registered numeric and text
     * field indices. Called every time a new key is found, or once at the beginning
     * of the program if no keys are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto valuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return valuesLists;
    }
}
2097 
final class UniqueKeyValuesLists
{
    /* A FieldValues object holds the list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --k 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    // NOTE(review): not referenced anywhere inside this class; looks unused. Left in
    // place because private members are visible module-wide - confirm before removing.
    private double[] _numericFieldMedians;

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved.
     * One FieldValues object is created per index, in the order given.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes the split input line to every numeric and text FieldValues object so each
     * can record the value of its own field, subject to the missing-field policy.
     */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    /* Linear search for the numeric FieldValues object saving the given field index.
     * The index must have been passed to the constructor (checked by assert).
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Linear search for the text FieldValues object saving the given field index.
     * The index must have been passed to the constructor (checked by assert).
     */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Saved numeric values for a field, in no particular order. The returned slice
     * refers to the stored data, not a copy.
     */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    /* Saved numeric values for a field, sorted in place. */
    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    /* Saved text values for a field, in no particular order. The returned slice
     * refers to the stored data, not a copy.
     */
    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    /* Saved text values for a field, sorted in place. */
    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    /* Median of the saved numeric values for a field. The result is cached until
     * another value is added.
     */
    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* Holds the values collected for one field, plus cached "sorted" and "median"
     * state. Both caches are invalidated whenever a value is appended.
     */
    private final class FieldValues(ValueType)
    {
        // Appender is the type used for _values below.
        import std.array : Appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        /* Number of values saved so far. */
        final size_t length() const @property
        {
            return _values.data.length;
        }

        /* Zero-based index of the field this object saves. */
        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Records this object's field from the split line. The text is converted to
         * ValueType with std.conv.to as it is read. A field the policy considers missing
         * is either replaced by the policy's replacement value or skipped.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                _values.put(field.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
            else if (missingPolicy.replaceMissing)
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The saved values as an array slice, in whatever order they are currently in. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The saved values sorted in place. The sort is skipped when the data is
         * already known to be sorted.
         */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* The median of the saved values, computed on first use and cached. */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                // rangeMedian works directly on the stored array and, per this file's
                // design notes, median calculations may (partially) reorder the data in
                // place. A previously sorted array can therefore no longer be trusted
                // to be sorted; drop the flag so getSortedArray re-sorts when needed.
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}
2254 
/** Base class for operators that act on exactly one input field, the most common kind
 * of Operator. Subclasses provide makeCalculator and the Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Stores the operator settings and builds the default output header from the field
     * index, with the operator name as suffix when useHeaderSuffix is set.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        // Default header. May be overridden by a custom header or the header line.
        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Replaces the output header with a caller supplied one. Only valid for operators
     * that allow custom headers (asserted).
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    final string name() const @property
    {
        return _name;
    }

    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of this operator's field should be saved. These should be called
     * during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    final string header() const @property
    {
        return _header;
    }

    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    /* Derives the output header from the input header line, unless a custom header
     * has been set, in which case the custom header is kept.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);
        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    /* Field indices whose numeric values this operator asked to have saved. */
    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    /* Field indices whose text values this operator asked to have saved. */
    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}
2358 
/** Base class for the common case of a calculator that works on a single field.
 * Subclasses implement processNextField() instead of processNextLine().
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Picks this calculator's field out of the split line and hands it to
     * processNextField. A field the operator's missing-field policy treats as missing
     * is replaced by the policy's replacement value when replaceMissing is set, and
     * skipped otherwise.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] value = fields[_fieldIndex];
        const bool usable = policy.useMissing || !policy.isMissingField(value);

        if (usable) processNextField(value);
        else if (policy.replaceMissing) processNextField(policy.missingReplacement);
    }

    abstract SingleFieldOperator getOperator();

    abstract void processNextField(const char[] field);
}
2397 
2398 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2399 version(unittest)
2400 {
2401     /** A helper for SingleFieldOperator unit tests.
2402      *
2403      * testSingleFieldOperator takes a set of split file values, a field index, a header
2404      * suffix, and a set of expected values. The expected values array contains the
2405      * initial value (zero entries) and the expected values after each line. (One more
2406      * expected value than input lines.) The zero entry case is what is generated for an
2407      * empty file. An example testing the 'min' operator against a file with 2 columns,
2408      * 3 rows, using field index 1:
2409      *
2410      *    testSingleFieldOperator!MinOperator(
2411      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2412      *        ["5", "50"],
2413      *        ["20", "200"]],
2414      *       1,                            // Field index (zero-based, so "100", "50", "200")
2415      *       "min",                        // The header suffix, normally the operator name.
2416      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2417      *
2418      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2419      * Then run the operator is tested against each column, a total of six calls. Headers
2420      * are automatically checked. Additional entries can be used to extend coverage.
2421      *
2422      * A non-default MissingFieldPolicy can be provide as an optional last argument.
2423      * Operator tests should include exclusion and replacement variations. See operator
2424      * unit tests for details.
2425      *
2426      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2427      * init arguments. Currently this is used only by the quantile operator.
2428      *
2429      * These tests do not check unique key behavior (group-by). Operators don't have info
2430      * about unique keys, and interact with them only indirectly, via Calculators.
2431      */
2432     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2433         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2434          const char[][] expectedValues,
2435          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2436     {
2437         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2438     }
2439 
2440     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2441         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2442          const char[][] expectedValues,
2443          MissingFieldPolicy missingPolicy,
2444          T extraOpInitArgs)
2445     {
2446         import std.format : format;
2447         import std.array : appender;
2448         import std.string : chomp;
2449         import std.traits : EnumMembers;
2450 
2451         auto numFields = (splitFile[0]).length;
2452 
2453         assert(fieldIndex < numFields,
2454                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2455                       headerSuffix));
2456         assert(splitFile.length + 1 == expectedValues.length,
2457                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2458                       headerSuffix));
2459 
2460         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2461         auto printOptions = SummarizerPrintOptions('#', '|');
2462 
2463         /* An input header line. */
2464         string[] inputHeaderLine = new string[numFields];
2465         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2466 
2467         /* The different expected output field headers. */
2468         auto outputFieldHeaderWithNoHeaderLine =
2469             fieldHeaderFromIndex(fieldIndex)
2470             .summaryHeaderFromFieldHeader(headerSuffix);
2471         auto outputFieldHeaderFromHeaderLine =
2472             inputHeaderLine[fieldIndex]
2473             .summaryHeaderFromFieldHeader(headerSuffix);
2474         auto customOutputFieldHeader = "custom";
2475 
2476         enum HeaderUsecase {
2477             HeaderLine_DefaultHeader,
2478             HeaderLine_CustomHeader,
2479             NoHeaderLine_DefaultHeader,
2480             NoHeaderLine_CustomHeader,
2481             NoHeaderLine_NoOutputHeader,
2482         }
2483 
2484         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2485         {
2486             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2487                           op.name, hc, actual, expected);
2488         }
2489 
2490         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2491                                   const char[] actual, const char[] expected)
2492         {
2493             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2494                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2495         }
2496 
2497         /* Run the logic for each header use case. */
2498         foreach (hc; EnumMembers!HeaderUsecase)
2499         {
2500             bool hasInputHeader = (
2501                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2502                 hc == HeaderUsecase.HeaderLine_CustomHeader
2503                 );
2504             bool hasOutputHeader = (
2505                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2506                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2507                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2508                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2509                 );
2510             bool hasCustomHeader = (
2511                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2512                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2513                 );
2514 
2515             if (hasCustomHeader) assert(hasOutputHeader);
2516 
2517             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2518 
2519             if (hasCustomHeader)
2520             {
2521                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2522                 op.setCustomHeader(customOutputFieldHeader);
2523             }
2524 
2525             Operator[] operatorArray;
2526             operatorArray ~= op;
2527 
2528             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2529             summarizer.setOperators(inputRangeObject(operatorArray));
2530 
2531             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2532 
2533             if (hasOutputHeader)
2534             {
2535                 /* Write the header line. Note that this is a one-field header, */
2536                 auto headerLineOutput = appender!(char[])();
2537                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2538 
2539                 /* Test that the header was generated correctly.
2540                  *
2541                  * Note: Because the output is generated by a Summarizer, it will have a
2542                  * trailing newline. Use chomp to trim it.
2543                  */
2544                 final switch (hc)
2545                 {
2546                 case HeaderUsecase.HeaderLine_DefaultHeader:
2547                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2548                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2549                                                outputFieldHeaderFromHeaderLine));
2550                     break;
2551                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2552                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2553                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2554                                                outputFieldHeaderWithNoHeaderLine));
2555                     break;
2556                 case HeaderUsecase.HeaderLine_CustomHeader:
2557                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2558                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2559                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2560                                                customOutputFieldHeader));
2561                     break;
2562                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2563                     break;
2564                }
2565 
2566             }
2567 
2568             /* For each line, process the line, generate the output, and test that the
2569              * value is correct. Start with the empty file case.
2570              */
2571             foreach (i, const char[] expected; expectedValues)
2572             {
2573                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2574                 auto summaryLineOutput = appender!(char[])();
2575                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2576                 assert(summaryLineOutput.data.chomp == expected,
2577                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2578                                           summaryLineOutput.data.chomp, expectedValues[i]));
2579             }
2580         }
2581     }
2582 }
2583 
/** Base class for operators that take no field as input. The main use case is the
 * CountOperator, which counts the occurrences of each unique key. Other uses are
 * possible, for example, weighted random number assignment.
 *
 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify
 * the information available to such a routine. In particular, the split fields passed
 * to processHeaderLine and processNextLine don't include all fields in the input,
 * something that might not be obvious when implementing an operator. (Only fields
 * required by operators acting on specific fields are included.)
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /* The operator name doubles as the default output header. */
    this(string operatorName)
    {
        _header = operatorName;
        _name = operatorName;
    }

    /* Replaces the output header with a caller supplied one. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* Custom headers are always allowed by this base class. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    final string name() const @property
    {
        return _name;
    }

    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    abstract ZeroFieldCalculator makeCalculator();
}
2646 
/** Base class for calculators that don't use fields as input, in particular the Count
 * operator's calculator. It is the companion of the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() rather than processNextLine(), and the
 * single argument form of calculate(), which is declared abstract here.
 */
class ZeroFieldCalculator : Calculator
{
    this() {}

    /* The fields are ignored; each line simply counts as one entry. */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        processNextEntry();
    }

    /* The values lists are ignored; forwards to the single argument calculate(). */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        string result = calculate(printOptions);
        return result;
    }

    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}
2671 
2672 version(unittest)
2673 {
2674     /* A helper for ZeroFieldOperator unit tests.
2675      *
2676      * testZeroFieldOperator takes a set of split file values, a default header, and a
2677      * set of expected values. The expected values array contains the expected values
2678      * after each line.
2679      *
2680      * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
2681      * there is no use of field indices and fewer types of headers. See the latter's
2682      * documentation and the CountOperator unit tests for examples.
2683      */
2684     void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
2685         (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
2686     {
2687         import std.format : format;
2688         import std.array : appender;
2689         import std.string : chomp;
2690         import std.traits : EnumMembers;
2691 
2692         auto numFields = (splitFile[0]).length;
2693 
2694         assert(splitFile.length + 1 == expectedValues.length,
2695                format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2696                       defaultHeader));
2697 
2698         /* printOptions - Not used these tests, but needed for API calls. */
2699         auto printOptions = SummarizerPrintOptions('#', '|');
2700 
2701         /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
2702         auto missingPolicy = new MissingFieldPolicy;
2703 
2704         /* An input header line. */
2705         string[] inputHeaderLine = new string[numFields];
2706         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2707 
2708         auto customOutputFieldHeader = "custom";
2709 
2710         enum HeaderUsecase {
2711             HeaderLine_DefaultHeader,
2712             HeaderLine_CustomHeader,
2713             NoHeaderLine_DefaultHeader,
2714             NoHeaderLine_CustomHeader,
2715             NoHeaderLine_NoOutputHeader,
2716         }
2717 
2718         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2719         {
2720             return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2721                           op.name, hc, actual, expected);
2722         }
2723 
2724         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
2725                                   const char[] actual, const char[] expected)
2726         {
2727             return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
2728                           op.name, hc, rowIndex, actual, expected);
2729         }
2730 
2731         /* Run the logic for each header use case. */
2732         foreach (hc; EnumMembers!HeaderUsecase)
2733         {
2734             bool hasInputHeader = (
2735                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2736                 hc == HeaderUsecase.HeaderLine_CustomHeader
2737                 );
2738             bool hasOutputHeader = (
2739                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2740                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2741                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2742                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2743                 );
2744             bool hasCustomHeader = (
2745                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2746                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2747                 );
2748 
2749             if (hasCustomHeader) assert(hasOutputHeader);
2750 
2751             auto op = new OperatorClass();
2752 
2753             if (hasCustomHeader)
2754             {
2755                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2756                 op.setCustomHeader(customOutputFieldHeader);
2757             }
2758 
2759             Operator[] operatorArray;
2760             operatorArray ~= op;
2761 
2762             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2763             summarizer.setOperators(inputRangeObject(operatorArray));
2764             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2765 
2766             if (hasOutputHeader)
2767             {
2768                 /* Write the header line. Note that this is a one-field header, */
2769                 auto headerLineOutput = appender!(char[])();
2770                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2771 
2772                 /* Test that the header was generated correctly.
2773                  *
2774                  * Note: Because the output is generated by a Summarizer, it will have a
2775                  * trailing newline. Use chomp to trim it.
2776                  */
2777                 final switch (hc)
2778                 {
2779                 case HeaderUsecase.HeaderLine_DefaultHeader:
2780                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2781                     assert(headerLineOutput.data.chomp == defaultHeader,
2782                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2783                                                defaultHeader));
2784                     break;
2785                 case HeaderUsecase.HeaderLine_CustomHeader:
2786                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2787                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2788                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2789                                                customOutputFieldHeader));
2790                     break;
2791                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2792                     break;
2793                 }
2794 
2795             }
2796 
2797             /* For each line, process the line, generate the output, and test that the
2798              * value is correct. Start with the empty file case.
2799              */
2800             foreach (i, const char[] expected; expectedValues)
2801             {
2802                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2803                 auto summaryLineOutput = appender!(char[])();
2804                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2805                 assert(summaryLineOutput.data.chomp == expected,
2806                        valueAssertMessage(operatorArray[0], hc, i,
2807                                           summaryLineOutput.data.chomp, expectedValues[i]));
2808             }
2809         }
2810     }
2811 }
2812 
2813 /* Specific operators.
2814  *
2815  * Notes:
2816  * - The 'Calculator' inner classes are 'static'. This means inner class instances do not
2817  *   keep a reference to the context of the outer class. In exchange, Calculator instances
2818  *   need to hold all needed state, typically the field index they are summarizing.
2819  */
2820 
2821 /** CountOperator counts the number of occurrences of each unique key, or the number of
2822  * input lines if there is no unique key.
2823  *
2824  * CountOperator differs from most other operators in that it doesn't summarize a specific
2825  * field on the line. Instead it is summarizing a property of the unique key itself. For
2826  * this reason it doesn't derive from SingleFieldOperator.
2827  */
final class CountOperator : ZeroFieldOperator
{
    /** Constructs the operator, passing "count" to the ZeroFieldOperator constructor. */
    this()
    {
        super("count");
    }

    /** Returns a new CountCalculator whose tally starts at zero. */
    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator();
    }

    /** Keeps a tally of how many times processNextEntry has been called.
     *
     * The class is static, so it carries no reference to the enclosing operator.
     */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        /** Adds one to the tally. */
        final override void processNextEntry()
        {
            ++_count;
        }

        /** Returns the tally as formatted by printOptions.formatNumber. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}
2855 
unittest // CountOperator
{
    /* Three inputs of differing widths. The same expected sequence is used for each,
     * as the calculator never looks at field content.
     */
    auto file1Col = [["10"], ["9.5"], ["11"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testZeroFieldOperator!CountOperator(file1Col, "count", ["0", "1", "2", "3"]);
    testZeroFieldOperator!CountOperator(file2Col, "count", ["0", "1", "2", "3"]);
    testZeroFieldOperator!CountOperator(file3Col, "count", ["0", "1", "2", "3"]);
}
2866 
2867 /** RetainOperator retains the first occurrence of a field, without changing the header.
2868  *
2869  * RetainOperator is intended for fields where the value is expected to be the same for
2870  * all occurrences of the unique key, and the goal is to pass the value through unchanged.
2871  * It is like FirstOperator, except that the original header is preserved. The original
 * header preservation is set up in the call to the SingleFieldOperator constructor.
2873  *
2874  * Notes:
2875  * - An option to signal an error if multiple values are encountered might be useful.
2876  */
final class RetainOperator : SingleFieldOperator
{
    /** Constructs a "retain" operator for the given field.
     *
     * Both No.useHeaderSuffix and No.allowCustomHeader are passed to the base
     * class constructor.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    /** Returns a new RetainCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    /** Remembers the first field value it receives; later values are ignored. */
    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing RetainOperator instance. */
        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /** Stores a copy of nextField on the first call; subsequent calls do nothing. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the stored value, or the empty string if no value was stored. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2919 
unittest // RetainOperator
{
    /* Inputs of one, two, and three columns. */
    auto file1Col = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto file2Col = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto file3Col = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* For every column tested, the row-one value is expected from the first line onward. */
    testSingleFieldOperator!RetainOperator(file1Col, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(file2Col, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(file2Col, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(file3Col, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(file3Col, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(file3Col, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    /* An input whose first line has an empty field, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!RetainOperator(file1ColMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RetainOperator(file1ColMissing, 0, "", ["", "NA", "NA", "NA"],
                                           new MissingFieldPolicy(false, "NA"));  // Replace missing
}
2939 
2940 /** FirstOperator outputs the first value found for the field.
2941  */
final class FirstOperator : SingleFieldOperator
{
    /** Constructs a "first" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    /** Returns a new FirstCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    /** Remembers the first field value it receives; later values are ignored. */
    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing FirstOperator instance. */
        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /** Stores a copy of nextField on the first call; subsequent calls do nothing. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the stored value, or the empty string if no value was stored. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2984 
unittest // FirstOperator
{
    /* Inputs of one, two, and three columns. */
    auto file1Col = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto file2Col = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto file3Col = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* For every column tested, the row-one value is expected from the first line onward. */
    testSingleFieldOperator!FirstOperator(file1Col, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(file2Col, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(file2Col, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(file3Col, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(file3Col, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(file3Col, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    /* An input whose first line has an empty field, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!FirstOperator(file1ColMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!FirstOperator(file1ColMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          new MissingFieldPolicy(false, "NA"));  // Replace missing
}
3004 
3005 /** LastOperator outputs the last value found for the field.
3006  */
final class LastOperator : SingleFieldOperator
{
    /** Constructs a "last" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    /** Returns a new LastCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    /** Holds the field value received most recently. */
    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing LastOperator instance. */
        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /** Overwrites the stored value with a copy of nextField. */
        final override void processNextField(const char[] nextField)
        {
            _value = to!string(nextField);
        }

        /** Returns the stored value, or the empty string if no value was stored. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
3044 
unittest // LastOperator
{
    /* Inputs of one, two, and three columns. */
    auto file1Col = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto file2Col = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto file3Col = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The expected value tracks the most recently processed row of the selected column. */
    testSingleFieldOperator!LastOperator(file1Col, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(file2Col, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(file2Col, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(file3Col, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(file3Col, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(file3Col, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    /* An input whose first line has an empty field, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!LastOperator(file1ColMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!LastOperator(file1ColMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(false, "NA"));  // Replace missing
}
3064 
/** MinOperator outputs the minimum value for the field. This is a numeric operator.
3066  *
3067  * This operator returns the original string without additional numeric formatting.
3068  * This can be useful when joining back to the original data. This is different than
3069  * numeric operators that perform calculations.
3070  */
final class MinOperator : SingleFieldOperator
{
    /** Constructs a "min" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    /** Returns a new MinCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    /** Tracks the smallest numeric value seen along with the text it was parsed from. */
    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MinOperator instance. */
        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /** Parses nextField as a double and records it if it is the new minimum.
         *
         * The first value is always recorded. After that a value is recorded only
         * when it is strictly less than the current minimum, so on ties the earlier
         * text is kept.
         */
        final override void processNextField(const char[] nextField)
        {
            const double candidate = to!double(nextField);

            if (_isFirst || candidate < _value)
            {
                _value = candidate;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the minimum, or "nan" if no value was recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
3121 
unittest // MinOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["11"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value in each case is "nan". */
    testSingleFieldOperator!MinOperator(file1Col, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(file2Col, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(file2Col, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(file3Col, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(file3Col, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(file3Col, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    /* An input whose first line has an empty field, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["10"], ["-10"]];
    testSingleFieldOperator!MinOperator(file1ColMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MinOperator(file1ColMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}
3141 
/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
3143  *
3144  * This operator returns the original string without additional numeric formatting.
3145  * This can be useful when joining back to the original data. This is different than
3146  * numeric operators that perform calculations.
3147  */
final class MaxOperator : SingleFieldOperator
{
    /** Constructs a "max" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    /** Returns a new MaxCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /** Tracks the largest numeric value seen along with the text it was parsed from. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MaxOperator instance. */
        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /** Parses nextField as a double and records it if it is the new maximum.
         *
         * The first value is always recorded. After that a value is recorded only
         * when it is strictly greater than the current maximum, so on ties the
         * earlier text is kept.
         */
        final override void processNextField(const char[] nextField)
        {
            const double candidate = to!double(nextField);

            if (_isFirst || candidate > _value)
            {
                _value = candidate;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the maximum, or "nan" if no value was recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
3198 
unittest // MaxOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["11"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value in each case is "nan". */
    testSingleFieldOperator!MaxOperator(file1Col, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(file2Col, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(file2Col, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(file3Col, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(file3Col, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(file3Col, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    /* An input whose first line has an empty field, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["-10"], ["10"]];
    testSingleFieldOperator!MaxOperator(file1ColMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MaxOperator(file1ColMissing, 0, "max", ["nan", "5", "5", "10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}
3218 
3219 /** RangeOperator outputs the difference between the minimum and maximum values.
3220  *
3221  * If there is a single value, or all values are the same, the range is zero. This is
3222  * a numeric operator.
3223  */
final class RangeOperator : SingleFieldOperator
{
    /** Constructs a "range" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    /** Returns a new RangeCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /** Tracks the smallest and largest numeric values seen. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing RangeOperator instance. */
        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /** Parses nextField as a double and widens the tracked bounds if needed. */
        final override void processNextField(const char[] nextField)
        {
            const double candidate = to!double(nextField);

            /* The first value seeds both bounds. */
            if (_isFirst)
            {
                _minValue = candidate;
                _maxValue = candidate;
                _isFirst = false;
                return;
            }

            if (candidate > _maxValue) _maxValue = candidate;
            else if (candidate < _minValue) _minValue = candidate;
        }

        /** Returns the formatted difference between the maximum and minimum.
         *
         * Both bounds start at 0.0, so the difference is zero when no value was processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
3276 
unittest // RangeOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["11"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first two expected values in each case are "0". */
    testSingleFieldOperator!RangeOperator(file1Col, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(file2Col, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(file2Col, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(file3Col, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(file3Col, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(file3Col, 2, "range", ["0", "0", "4", "16.5"]);

    /* An input with empty fields on lines one and three, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!RangeOperator(file1ColMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RangeOperator(file1ColMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          new MissingFieldPolicy(false, "5.5"));  // Replace missing
}
3296 
3297 /** SumOperator produces the sum of all the values. This is a numeric operator.
3298  */
final class SumOperator : SingleFieldOperator
{
    /** Constructs a "sum" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /** Returns a new SumCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Accumulates a running total of the numeric values seen. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing SumOperator instance. */
        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /** Parses nextField as a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
        }

        /** Returns the formatted running total; the total is 0.0 if nothing was added. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }
}
3336 
unittest // SumOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["11"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* Expected values are running totals, starting from "0". */
    testSingleFieldOperator!SumOperator(file1Col, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(file2Col, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(file2Col, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(file3Col, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(file3Col, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(file3Col, 2, "sum", ["0", "-4.5", "-5", "7"]);

    /* An input with empty fields on lines one and three, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!SumOperator(file1ColMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!SumOperator(file1ColMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        new MissingFieldPolicy(false, "1.5"));  // Replace missing
}
3356 
3357 /** MeanOperator produces the mean (average) of all the values. This is a numeric operator.
3358  */
final class MeanOperator : SingleFieldOperator
{
    /** Constructs a "mean" operator for the given field and missing-field policy. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /** Returns a new MeanCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Accumulates a running total and a count of the numeric values seen. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MeanOperator instance. */
        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /** Parses nextField as a double, adds it to the total, and bumps the count. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
            _count++;
        }

        /** Returns the formatted mean, or formatted NaN when no values were counted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* Avoid dividing by zero when nothing has been processed. */
            if (_count == 0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_total / to!double(_count));
        }
    }
}
3399 
unittest // MeanOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["7.5"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The first expected value in each case is "nan". */
    testSingleFieldOperator!MeanOperator(file1Col, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(file2Col, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(file2Col, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(file3Col, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(file3Col, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(file3Col, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    /* An input with empty fields on lines one and three, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["6"], [""], ["14"], ["40"]];
    testSingleFieldOperator!MeanOperator(file1ColMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MeanOperator(file1ColMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         new MissingFieldPolicy(false, "0"));  // Replace missing
}
3419 
3420 /** MedianOperator produces the median of all the values. This is a numeric operator.
3421  *
3422  * All the field values are stored in memory as part of this calculation. This is
3423  * handled by unique key value lists.
3424  */
final class MedianOperator : SingleFieldOperator
{
    /** Constructs a "median" operator and calls setSaveFieldValuesNumeric. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /** Returns a new MedianCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Holds no state of its own; the median is read from the values lists. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MedianOperator instance. */
        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /** Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Returns the formatted result of valuesLists.numericValuesMedian for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(
                valuesLists.numericValuesMedian(fieldIndex));
        }
    }
}
3460 
unittest // MedianOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["7.5"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The first expected value in each case is "nan". */
    testSingleFieldOperator!MedianOperator(file1Col, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(file2Col, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(file2Col, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(file3Col, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(file3Col, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(file3Col, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    /* An input with empty fields on lines one and three, run under both missing-field policies. */
    auto file1ColMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperator!MedianOperator(file1ColMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MedianOperator(file1ColMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           new MissingFieldPolicy(false, "0"));  // Replace missing
}
3480 
/** QuantileOperator produces the value representing the data at a cumulative probability.
3482  * This is a numeric operation.
3483  *
3484  * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
3485  * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). Data is sorted in ascending order. This operator takes one percentile, but it
3487  * is common to generate multiple quantile ranks for the same field when summarizing.
3488  *
3489  * All the field's values are stored in memory as part of this calculation. This is
3490  * handled by unique key value lists.
3491  */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /** Constructs a quantile operator for the given probability.
     *
     * The name passed to the base constructor is "pct" followed by the probability
     * expressed as a percentage, with "pct0" used for a probability of exactly zero.
     * The probability is asserted to be in the range [0.0, 1.0].
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(0.0 <= probability && probability <= 1.0);
        import std.format : format;

        string header;
        if (probability == 0.0)
        {
            header = "pct0";
        }
        else
        {
            header = format("pct%02g", probability * 100.0);
        }

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /** Returns a new QuantileCalculator for this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Holds no state of its own; the quantile is computed from the values lists. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing QuantileOperator instance. */
        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /** Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Returns the formatted quantile of the field's sorted numeric values, using
         * the probability held by the enclosing operator.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            return printOptions.formatNumber(
                quantile(this.outer._prob,
                         valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}
3536 
unittest // QuantileOperator
{
    /* Numeric inputs of one, two, and three columns. */
    auto file1Col = [["10"], ["9.5"], ["7.5"]];
    auto file2Col = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto file3Col = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto stdPolicy = new MissingFieldPolicy;

    /* Probability 0.5: same as the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(file1Col, 0, "pct50", ["nan", "10", "9.75", "9.5"], stdPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 0, "pct50", ["nan", "20", "20.5", "21"], stdPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 1, "pct50", ["nan", "-30", "-29.5", "-30"], stdPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 0, "pct50", ["nan", "9009", "4509", "4509"], stdPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 1, "pct50", ["nan", "9", "4.5", "0"], stdPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], stdPolicy, 0.50);

    /* Probability 0.0: the extreme at zero is the min. */
    testSingleFieldOperatorBase!QuantileOperator(file1Col, 0, "pct0", ["nan", "10", "9.5", "7.5"], stdPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 0, "pct0", ["nan", "20", "20", "20"], stdPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 1, "pct0", ["nan", "-30", "-30", "-31"], stdPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 0, "pct0", ["nan", "9009", "9", "9"], stdPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 1, "pct0", ["nan", "9", "0", "-3"], stdPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], stdPolicy, 0.0);

    /* Probability 1.0: the extreme at one is the max. */
    testSingleFieldOperatorBase!QuantileOperator(file1Col, 0, "pct100", ["nan", "10", "10", "10"], stdPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 0, "pct100", ["nan", "20", "21", "22"], stdPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(file2Col, 1, "pct100", ["nan", "-30", "-29", "-29"], stdPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 0, "pct100", ["nan", "9009", "9009", "9009"], stdPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 1, "pct100", ["nan", "9", "9", "9"], stdPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(file3Col, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], stdPolicy, 1.0);

    /* For missing policies, re-use the median tests. */
    auto file1ColMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperatorBase!QuantileOperator(file1ColMissing, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                                 new MissingFieldPolicy(true, ""), 0.5);  // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(file1ColMissing, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
                                                 new MissingFieldPolicy(false, "0"), 0.5);  // Replace missing
}
3575 
/** MadOperator reports the median absolute deviation from the median (MAD) of a
 * field's values. This is a numeric operation.
 *
 * The value produced is the raw MAD; no normalization is applied to it.
 *
 * Every field value is kept in memory for this calculation. The storage is
 * handled by the unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to accumulate per field; calculate() reads the saved field values. */
        final override void processNextField(const char[] nextField)
        { }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            /* The median is requested first, then the values themselves. */
            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto observations = valuesLists.numericValues(fieldIndex);

            /* One absolute deviation per saved value. */
            auto absDeviations = new double[observations.length];
            foreach (size_t pos, double obs; observations)
            {
                absDeviations[pos] = abs(obs - center);
            }

            auto mad = absDeviations.rangeMedian;
            return printOptions.formatNumber(mad);
        }
    }
}
3628 
unittest // MadOperator
{
    auto oneColRows = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoColRows = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeColRows = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    /* Default missing-field policy. */
    testSingleFieldOperator!MadOperator(oneColRows, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(twoColRows, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(twoColRows, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(threeColRows, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 2, "mad", ["nan", "0", "1", "2"]);

    auto oneColRowsWithGaps = [[""], ["16"], [""], ["32"], ["-4"]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MadOperator(oneColRowsWithGaps, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
                                        excludePolicy);

    /* Empty fields are replaced by "0". */
    auto replacePolicy = new MissingFieldPolicy(false, "0");
    testSingleFieldOperator!MadOperator(oneColRowsWithGaps, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
                                        replacePolicy);
}
3648 
/** VarianceOperator generates the variance of a field's values. This is a numeric
 * operator.
 *
 * The result is computed incrementally from a running count, mean, and sum of
 * squared differences. It is NaN when fewer than two values have been processed;
 * otherwise the sum of squared differences is divided by (count - 1).
 */
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Running sum of squared differences from the current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            ++_count;
            double sample = nextField.to!double;
            double offsetFromPriorMean = sample - _mean;
            _mean += offsetFromPriorMean / _count;

            /* The second factor uses the mean after the update just above. */
            _m2 += offsetFromPriorMean * (sample - _mean);
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double variance = double.nan;
            if (_count >= 2.0)
            {
                variance = _m2 / (_count - 1.0);
            }
            return printOptions.formatNumber(variance);
        }
    }
}
3695 
unittest // VarianceOperator
{
    auto oneColRows = [["5"], ["10"], ["15"]];
    auto twoColRows = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeColRows = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    /* Default missing-field policy. The first two expected values are "nan". */
    testSingleFieldOperator!VarianceOperator(oneColRows, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoColRows, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoColRows, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 2, "var", ["nan", "nan", "0", "3"]);

    auto oneColRowsWithGaps = [["5"], ["10"], [""]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!VarianceOperator(oneColRowsWithGaps, 0, "var", ["nan", "nan", "12.5", "12.5"],
                                             excludePolicy);

    /* Empty fields are replaced by "15". */
    auto replacePolicy = new MissingFieldPolicy(false, "15");
    testSingleFieldOperator!VarianceOperator(oneColRowsWithGaps, 0, "var", ["nan", "nan", "12.5", "25"],
                                             replacePolicy);
}
3715 
/** StDevOperator generates the standard deviation of a field's values. This is a
 * numeric operator.
 *
 * The result is computed incrementally from a running count, mean, and sum of
 * squared differences. It is NaN when fewer than two values have been processed;
 * otherwise it is the square root of the squared-difference sum divided by
 * (count - 1).
 */
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Running sum of squared differences from the current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            ++_count;
            double sample = nextField.to!double;
            double offsetFromPriorMean = sample - _mean;
            _mean += offsetFromPriorMean / _count;

            /* The second factor uses the mean after the update just above. */
            _m2 += offsetFromPriorMean * (sample - _mean);
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            double deviation = double.nan;
            if (_count >= 2.0)
            {
                deviation = sqrt(_m2 / (_count - 1.0));
            }
            return printOptions.formatNumber(deviation);
        }
    }
}
3763 
/* Unit tests for StDevOperator. Expected values are compared as text; a tolerance
 * option would improve these tests.
 */
unittest
{
    auto oneColRows = [["1"], ["4"], ["7"]];
    auto twoColRows = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeColRows = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    /* Default missing-field policy. The first two expected values are "nan". */
    testSingleFieldOperator!StDevOperator(oneColRows, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(twoColRows, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(twoColRows, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    auto oneColRowsWithGaps = [["1"], ["4"], [""]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!StDevOperator(oneColRowsWithGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
                                          excludePolicy);

    /* Empty fields are replaced by "7". */
    auto replacePolicy = new MissingFieldPolicy(false, "7");
    testSingleFieldOperator!StDevOperator(oneColRowsWithGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
                                          replacePolicy);
}
3785 
/** UniqueCountOperator generates the number of distinct values seen. Values are
 * compared by exact text match, not numerically.
 *
 * Each distinct field value is kept in memory as part of this calculation.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Used as a set: only the keys matter. */
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Already recorded: nothing to do, and no copy of the text is made. */
            if (nextField in _values) return;

            _values[nextField.to!string] = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctCount = _values.length;
            return printOptions.formatNumber(distinctCount);
        }
    }
}
3828 
unittest // UniqueCount
{
    auto oneColRows = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto twoColRows = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeColRows = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Default missing-field policy. */
    testSingleFieldOperator!UniqueCountOperator(oneColRows, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(twoColRows, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(twoColRows, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 2, "unique_count", ["0", "1", "2", "3"]);

    auto oneColRowsWithGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!UniqueCountOperator(oneColRowsWithGaps, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
                                                excludePolicy);

    /* Empty fields are replaced by "XYZ". */
    auto replacePolicy = new MissingFieldPolicy(false, "XYZ");
    testSingleFieldOperator!UniqueCountOperator(oneColRowsWithGaps, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
                                                replacePolicy);
}
3850 
/** MissingCountOperator generates the number of missing values. This overrides
 * the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate `MissingFieldPolicy(false, "")`.
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* The operator's saved policy decides what counts as missing. */
            auto policy = this.outer._globalMissingPolicy;
            if (policy.isMissingField(nextField))
            {
                _missingCount += 1;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3894 
unittest // MissingCount
{
    auto oneColRows = [["a"], ["b"], [""], [" "], [""]];
    auto twoColRows = [["abc", ""], ["", ""], ["def", ""]];
    auto threeColRows = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing-field policy. */
    testSingleFieldOperator!MissingCountOperator(oneColRows, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(twoColRows, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(twoColRows, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The same expected counts are checked under the exclude policy, then under
     * the replace policy.
     */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!MissingCountOperator(oneColRows, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(twoColRows, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(twoColRows, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(threeColRows, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(threeColRows, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(threeColRows, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}
3925 
/** NotMissingCountOperator generates the number of not-missing values. This
 * overrides the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate `MissingFieldPolicy(false, "")`.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* The operator's saved policy decides what counts as missing. */
            auto policy = this.outer._globalMissingPolicy;
            if (policy.isMissingField(nextField)) return;

            _notMissingCount += 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
3969 
unittest // NotMissingCount
{
    auto oneColRows = [["a"], ["b"], [""], [" "], [""]];
    auto twoColRows = [["abc", ""], ["", ""], ["def", ""]];
    auto threeColRows = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing-field policy. */
    testSingleFieldOperator!NotMissingCountOperator(oneColRows, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(twoColRows, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(twoColRows, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(threeColRows, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(threeColRows, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(threeColRows, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The same expected counts are checked under the exclude policy, then under
     * the replace policy.
     */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!NotMissingCountOperator(oneColRows, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], policy);
        testSingleFieldOperator!NotMissingCountOperator(twoColRows, 0, "not_missing_count", ["0", "1", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(twoColRows, 1, "not_missing_count", ["0", "0", "0", "0"], policy);
        testSingleFieldOperator!NotMissingCountOperator(threeColRows, 0, "not_missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(threeColRows, 1, "not_missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!NotMissingCountOperator(threeColRows, 2, "not_missing_count", ["0", "1", "2", "2"], policy);
    }
}
4000 
/** ModeOperator outputs the most frequently seen value. When several values are
 * tied for the highest count, the one seen first is produced. The result is the
 * empty string if no values were processed.
 *
 * Each distinct field value is kept in memory as part of this calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    final class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;            // Occurrences per distinct value
        private Appender!(string[]) _uniqueValues;      // Distinct values in first-seen order

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Known value: bump its count in place. */
            if (auto tally = nextField in _valueCounts)
            {
                ++(*tally);
                return;
            }

            /* New value: make one copy and use it for both containers. */
            string fresh = nextField.to!string;
            _uniqueValues.put(fresh);
            _valueCounts[fresh] = 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string winner = "";
            size_t winnerCount = 0;

            /* Walk in first-seen order; the strict comparison keeps the earliest
             * value when counts are tied.
             */
            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                auto occurrences = _valueCounts[candidate];
                if (occurrences > winnerCount)
                {
                    winner = candidate;
                    winnerCount = occurrences;
                }
            }

            return winner;
        }
    }
}
4072 
unittest // ModeOperator
{
    auto oneColRows = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoColRows = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeColRows = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Default missing-field policy. */
    testSingleFieldOperator!ModeOperator(oneColRows, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(twoColRows, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(twoColRows, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(threeColRows, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(threeColRows, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(threeColRows, 2, "mode", ["", "a", "a", "a"]);

    auto oneColRowsWithGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeOperator(oneColRowsWithGaps, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
                                         excludePolicy);

    /* Empty fields are replaced by "X". */
    auto replacePolicy = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeOperator(oneColRowsWithGaps, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
                                         replacePolicy);
}
4094 
/** ModeCountOperator outputs how many times the most frequently seen value
 * occurred. The result is zero if no values were processed.
 *
 * Each distinct field value is kept in memory as part of this calculation.
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    final class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;    // Occurrences per distinct value

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Known value: bump its count in place. */
            if (auto tally = nextField in _valueCounts)
            {
                ++(*tally);
                return;
            }

            /* New value: store a copy of the text with an initial count of one. */
            string fresh = nextField.to!string;
            _valueCounts[fresh] = 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highest = 0;
            foreach (occurrences; _valueCounts.byValue)
            {
                if (occurrences > highest)
                {
                    highest = occurrences;
                }
            }
            return printOptions.formatNumber(highest);
        }
    }
}
4149 
unittest // ModeCountOperator
{
    auto oneColRows = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoColRows = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto threeColRows = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Default missing-field policy. */
    testSingleFieldOperator!ModeCountOperator(oneColRows, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(twoColRows, 0, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(twoColRows, 1, "mode_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(threeColRows, 0, "mode_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(threeColRows, 1, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(threeColRows, 2, "mode_count", ["0", "1", "1", "1"]);

    auto oneColRowsWithGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeCountOperator(oneColRowsWithGaps, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
                                              excludePolicy);

    /* Empty fields are replaced by "X". */
    auto replacePolicy = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeCountOperator(oneColRowsWithGaps, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
                                              replacePolicy);
}
4171 
/** ValuesOperator outputs every value of the field, joined by the alternate
 * (values) delimiter from the print options.
 *
 * Every field value is kept in memory for this calculation. The storage is
 * handled by the unique key value lists.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to accumulate per field; calculate() reads the saved field values. */
        final override void processNextField(const char[] nextField)
        { }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedText = valuesLists.textValues(fieldIndex);
            return savedText.join(printOptions.valuesDelimiter);
        }
    }
}
4213 
unittest // ValuesOperator
{
    auto oneColRows = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoColRows = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto threeColRows = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* Default missing-field policy: empty fields show up as empty entries. */
    testSingleFieldOperator!ValuesOperator(oneColRows, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(twoColRows, 0, "values", ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(twoColRows, 1, "values", ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(threeColRows, 0, "values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(threeColRows, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(threeColRows, 2, "values", ["", "-", "-|--", "-|--|---"]);

    /* Empty fields are excluded. */
    auto excludePolicy = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ValuesOperator(oneColRows, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
                                           excludePolicy);

    /* Empty fields are replaced by "X". */
    auto replacePolicy = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ValuesOperator(oneColRows, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
                                           replacePolicy);
}
4234 
/** UniqueValuesOperator outputs each distinct value of the field, joined by the
 * alternate (values) delimiter from the print options. Values appear in the order
 * they were first seen.
 *
 * Each distinct field value is kept in memory as part of this calculation.
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        private size_t[string] _valuesHash;             // Used as a set; the stored number is not read
        private Appender!(string[]) _uniqueValues;      // Distinct values in first-seen order

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Already recorded: nothing to do, and no copy of the text is made. */
            if (nextField in _valuesHash) return;

            /* New value: make one copy and use it for both containers. */
            string fresh = nextField.to!string;
            _uniqueValues.put(fresh);
            _valuesHash[fresh] = 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctInOrder = _uniqueValues.data;
            return distinctInOrder.join(printOptions.valuesDelimiter);
        }
    }
}
4286 
unittest // UniqueValuesOperator
{
    /* Test inputs with one, two, and three fields per row. Each input repeats at
     * least one value in some field, so the de-duplication is exercised. The empty
     * strings are the entries affected by the missing-field policies used at the end.
     *
     * Every expected list holds one more entry than the input has rows, and the
     * first entry is always the empty string.
     */
    auto oneFieldRows = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoFieldRows = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto threeFieldRows = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Default missing-field policy: the empty field counts as a value, listed once. */
    testSingleFieldOperator!UniqueValuesOperator(
        oneFieldRows, 0, "unique_values",
        ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    testSingleFieldOperator!UniqueValuesOperator(
        twoFieldRows, 0, "unique_values",
        ["", "", "", "|xyz"]);
    testSingleFieldOperator!UniqueValuesOperator(
        twoFieldRows, 1, "unique_values",
        ["", "50", "50", "50|52"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeFieldRows, 0, "unique_values",
        ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeFieldRows, 1, "unique_values",
        ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeFieldRows, 2, "unique_values",
        ["", "-", "-|--", "-|--"]);

    /* Exclude missing: empty fields do not show up in the output. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!UniqueValuesOperator(
        oneFieldRows, 0, "unique_values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
        excludeMissing);

    /* Replace missing: empty fields show up as "X", listed once. */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!UniqueValuesOperator(
        oneFieldRows, 0, "unique_values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
        replaceMissing);
}