tsv_utils.tsv_summarize source code

1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.exception : enforce;
16 import std.format : format;
17 import std.range;
18 import std.stdio;
19 import std.typecons : tuple;
20 import std.container : DList;
21 
/* Embedded D runtime option string. The single entry, "gcopt=cleanup:none", is a
 * garbage collector configuration setting. The declaration is only compiled when
 * the compiler front-end version (__VERSION__) is 2085 or later.
 * NOTE(review): presumably this turns off the GC's collection pass at program exit
 * to speed up termination - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
23 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line into a TsvSummarizeOptions value and, if processing
     * says execution should continue, runs tsvSummarize.
     *
     * Returns: The exit code reported by argument processing when it stops execution,
     * 1 if tsvSummarize throws an Exception, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports across runs when built with DMD code coverage. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions options;
        auto argsResult = options.processArgs(cmdArgs);

        /* argsResult[0]: continue execution? argsResult[1]: exit code if not. */
        if (!argsResult[0])
        {
            return argsResult[1];
        }

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            tsvSummarize(options);
        }
        catch (Exception e)
        {
            stderr.writefln("Error [%s]: %s", options.programName, e.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
56 
/** Extended help text, printed ahead of the option list for '--help-verbose'.
 *
 * Option and operator names mentioned in this text must match the names registered
 * with getopt in TsvSummarizeOptions.processArgs (e.g. '--group-by', '--stdev').
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make is generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min time_mean
   ford   122      127
   chevy  124      124
   bmw    118      122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similarly way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25                // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
134 
/** Brief help text. TsvSummarizeOptions.processArgs passes it to defaultGetoptPrinter
 * ahead of the option list when getopt reports that help was requested.
 */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";
144 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 *
 * Besides the plain option values, the struct accumulates the list of summarization
 * Operators created by the operator options (in command line order) and derives
 * endFieldIndex, the number of leading fields that must be read from each input line.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                // Program name
    ByLineSourceRange!() inputSources; // Input Files
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.

    /* NOTE(review): a class instance created in a field initializer is built at compile
     * time and is presumably shared by every TsvSummarizeOptions value that keeps the
     * default - fine for the single options object created in main, but confirm before
     * creating several independent option sets.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.

    /** Processes the command line.
     *
     * Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * On return with true, cmdArgs has been truncated to the program name; the
     * remaining arguments were consumed as input file paths.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options

            /* Option table. The caseSensitive/caseInsensitive markers bracket the
             * options whose short form must be matched case-sensitively (-V, -H).
             * Operator options are handled by callbacks that append to 'operators'.
             */
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &countOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, like '--count', but use STR as the header.", &countHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &operatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &operatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            /* Help and version requests stop execution with a success exit code. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
            inputSources = byLineSourceRange(filepaths);

            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value is '<field-list>' or '<field>:<header>'. One operator is created
     * per field in the list and appended to 'operators'; endFieldIndex is raised as
     * needed. Any exception raised while processing the field list is rethrown with the
     * option name prefixed to its message.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        /* valSplit[0]: field list, valSplit[1]: the ':' if present, valSplit[2]: header. */
        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a ':' that is not followed by a header. */
        enforce(!valSplit[0].empty && (valSplit[1].empty || !valSplit[2].empty),
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));

        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (!valSplit[2].empty) // Header specified
                {
                    /* fieldNum counts from one, so a second field trips this check. */
                    enforce(fieldNum <= 1,
                            format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                                   option, optionVal));

                    enforce(op.allowCustomHeader,
                            format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                                   option, optionVal));

                    op.setCustomHeader(valSplit[2].to!string);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>]' or '<field>:<prob>:<header>'.
     * One QuantileOperator is created for every (field, probability) combination.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        enforce(!split1[0].empty && (split1[1].empty || !split1[2].empty),
                formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. An empty
         * probability section (no ':' in the option value) fails the check below.
         */
        auto split2 = findSplit(split1[2], ":");

        enforce(!split2[0].empty && (split2[1].empty || !split2[2].empty),
                formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
            {
                fieldIndices ~= fieldIndex;
            }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            /* Conversion failures are reported with the general syntax message. */
            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            enforce(p >= 0.0 && p <= 1.0,
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator with STR as header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws (via enforce)
     * on the first violated rule.
     */
    private void consistencyValidations()
    {
        enforce(!operators.empty, "At least one summary operator is required.");

        /* The message names the options as registered with getopt: 'd|delimiter'. */
        enforce(inputFieldDelimiter != valuesDelimiter,
                "Cannot use the same character for both --d|delimiter and --v|values-delimiter.");

        enforce(!(excludeMissing && missingValueReplacement.length != 0),
                "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
    }

    /* Post-processing derivations. Called once option parsing and validation succeed. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}
426 
/** Runs the summarization described by cmdopt.
 *
 * Every input source is read line by line. The leading fields needed by the operators
 * and key fields are split out of each line and handed to a Summarizer chosen by the
 * number of key fields. After all input is read, the summary is written to standard
 * output, preceded by a header line if requested.
 *
 * Throws: Exception if a line has fewer fields than needed, or if the summarizer
 * fails to process a line.
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine,
        throwIfWindowsNewlineOnUnix;

    alias StdoutWriter = typeof(stdout.lockingTextWriter());

    /* The input setup is expected to provide at least one source (stdin if nothing
     * else) and a byLine range that strips newlines.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* Zero, one, and multiple key fields each have a dedicated Summarizer. */
    const numKeyFields = cmdopt.keyFields.length;
    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!StdoutWriter(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!StdoutWriter(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!StdoutWriter(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Number of leading fields to capture per line. Zero if no operator needs fields,
     * notably the count operator. Used by itself, it counts input lines (ala 'wc -l').
     */
    const fieldsNeeded = cmdopt.endFieldIndex;
    auto lineFields = new char[][](fieldsNeeded);
    bool headerFound = false;

    foreach (inputStream; cmdopt.inputSources)
    {
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum);

            if (fieldsNeeded > 0)
            {
                /* Capture at most fieldsNeeded fields from the front of the line. */
                size_t numCaptured = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter).take(fieldsNeeded))
                {
                    lineFields[numCaptured] = fieldValue;
                    numCaptured++;
                }

                if (numCaptured == 0)
                {
                    assert(fieldsNeeded > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[0] = line;
                    numCaptured = 1;
                }

                enforce(numCaptured >= fieldsNeeded,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, lineNum));
            }

            const isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            if (!isHeaderLine)
            {
                /* Data line. Processing throws if a field cannot be converted to the
                 * type an operator expects; the error is rethrown with file and line.
                 */
                try
                {
                    summarizer.processNextLine(lineFields);
                }
                catch (Exception exc)
                {
                    const headerHint =
                        (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : "";
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               exc.msg, inputStream.name, lineNum, headerHint));
                }
            }
            else if (!headerFound)
            {
                /* Only the first file's header line is passed to the summarizer. */
                summarizer.processHeaderLine(lineFields);
                headerFound = true;
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    const wantHeader = cmdopt.hasHeader || cmdopt.writeHeader;
    if (wantHeader) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}
540 
/** Builds the default header for a field.
 *
 * Used when output headers are wanted but the input has none. The result is "fieldN",
 * with N being the one-based field number, i.e. the zero-based fieldIndex plus one.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    const oneBasedFieldNum = fieldIndex + 1;
    return format("field%d", oneBasedFieldNum);
}

unittest
{
    // Zero-based index in, one-based field number out.
    static immutable size_t[] indices = [0, 10];
    static immutable string[] expected = ["field1", "field11"];

    foreach (i, idx; indices)
    {
        assert(fieldHeaderFromIndex(idx) == expected[i]);
    }
}
556 
/** Derives a summary column header from a field header and an operation name.
 *
 * The two parts are joined with an underscore: field header "length" and operation
 * "max" give "length_max". The field header typically comes from a header line in
 * the input data or was constructed by fieldHeaderFromIndex().
 *
 * An empty operationName leaves fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0)
    {
        return fieldHeader;
    }
    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    // With an operation name the parts are joined by an underscore.
    const combined = summaryHeaderFromFieldHeader("originalfield", "mycalc");
    assert(combined == "originalfield_mycalc");

    // Without one the field header passes through.
    const passthrough = summaryHeaderFromFieldHeader("originalfield", "");
    assert(passthrough == "originalfield");
}
577 
/** Printing options shared by Summarizers and Calculators.
 *
 * The values normally come from command line options; they are bundled in their own
 * struct for modularity.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;         /// Separator written between output fields.
    char valuesDelimiter;        /// Separator written between values within one field.
    size_t floatPrecision = 12;  /// Precision passed to the number formatter.

    /** Formats an integral or floating point number using floatPrecision. The work is
     * delegated to tsv_utils.common.numerics.formatNumber.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;
        return numericsFormatNumber!T(n, floatPrecision);
    }
}
596 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after construction with the range of operators to process.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 *
 * The OutputRange template parameter is the type of the output stream passed to the
 * two write methods.
 */
interface Summarizer(OutputRange)
{
    /** Called after construction with the range of operators to be processed. */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     *
     *  lineFields holds the fields split from the header line.
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. lineFields holds the fields split from
     *  the line.
     */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line to outputStream using the given print options. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines to outputStream using the given print options. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
628 
/** SummarizerBase holds the work common to all summarizers, which is most everything
 * except the handling of unique keys.
 *
 * It stores the Operators, creates the SharedFieldValues object when an operator asks
 * for field values to be saved, and dispatches the first header line to the operators.
 * Derived classes deal primarily with unique keys and the associated Calculators and
 * UniqueKeyValuesLists.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Stays null unless an operator saves field values.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /** The field delimiter given at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Stores the Operators used by the Summarizer. Called after construction.
     *
     * Each operator is appended to the internal list and counted. Field indices an
     * operator wants saved, numeric or text, are registered with the shared field
     * values object, which is created on first need.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register for operators that save no field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (idx; numericIndices) _sharedFieldValues.addNumericIndex(idx);
            foreach (idx; textIndices) _sharedFieldValues.addTextIndex(idx);
        }
    }

    /** Called to process the header line of each file. Only the first call forwards
     *  the header fields to the operators and returns true; later calls (additional
     *  files) do nothing and return false.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators[]) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a UniqueKeyValuesLists from the shared field values, or returns null
     *  when no operator asked for field values to be saved.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
706 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * There is no grouping key, so a single set of Calculators (one per Operator) and a
 * single UniqueKeyValuesLists accumulate every data line.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;          // _calculators[i] belongs to the i-th registered operator.
    private UniqueKeyValuesLists _valueLists;   // Null when no operator saves field values.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed.
     *
     * The base class registers the operators; this override then creates the one
     * Calculator needed per Operator and the shared values lists.
     */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operator. Build them from the operator list stored
         * by the base class rather than walking 'operators' a second time: an
         * InputRange is single-pass by contract and may already have been consumed
         * by the base class call. Operators that already have a Calculator (from an
         * earlier call) are skipped so calculators stay one-to-one with operators.
         */
        foreach (op; _operators[].drop(_calculators.length)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the output
     *  field delimiter, followed by a newline.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result line: one calculated value per operator, joined
     *  by the output field delimiter, followed by a newline.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
756 
/** KeySummarizerBase holds the work common to the single-key and multi-key summarizers.
 *
 * Those two differ mainly in how the key is formed. They are kept as separate
 * classes to simplify (speed up) handling of single field keys, which are the most
 * common use case.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /** State kept for each unique key: one Calculator per operator plus the saved
     *  field values (null when no operator saves field values).
     */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;               // Keys in the order they were first added.
    private UniqueKeyData[string] _uniqueKeyData;   // Per-key state, looked up by key.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Routes a data line to the calculators and values lists of its key, creating
     *  the per-key state the first time the key is seen.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData keyData;
        if (auto existing = key in _uniqueKeyData)
        {
            keyData = *existing;
        }
        else
        {
            keyData = addUniqueKey(key.to!string);
        }

        foreach (calc; keyData.calculators) calc.processNextLine(lineFields);

        if (keyData.valuesLists !is null)
        {
            keyData.valuesLists.processNextLine(lineFields, _missingPolicy);
        }
    }

    /** Registers a key that has not been seen before and returns its new state. */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        /* One calculator per operator, in operator order. */
        auto keyCalculators = new Calculator[_numOperators];
        foreach (slot, op; _operators[].enumerate)
        {
            keyCalculators[slot] = op.makeCalculator;
        }

        auto entry = UniqueKeyData(keyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = entry;
        return entry;
    }

    /** Writes the header line: the key field header, then each operator header. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);

        auto operatorHeaders = _operators[].map!(op => op.header).join(printOptions.fieldDelimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order the keys were added. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (uniqueKey; _uniqueKeys)
        {
            auto keyData = _uniqueKeyData[uniqueKey];

            put(outputStream, uniqueKey);
            put(outputStream, printOptions.fieldDelimiter);

            auto resultFields =
                keyData.calculators[]
                .map!(calc => calc.calculate(keyData.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter);
            put(outputStream, resultFields);
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}
832 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key is the text of the field at the configured zero-based index. Until a header
 * line is processed, the key field header is the one produced by fieldHeaderFromIndex.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;   // NOTE(review): not referenced in this class; the base class keeps its own list.

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    /** Header text for the key column of the summary output. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line, the key field header is
     *  replaced by the text of the key field. Returns true if this was the first
     *  header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* The key field is indexed below, so the index must be strictly less than
         * the number of fields (same check as processNextLine).
         */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the key field's text as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
871 
/** This Summarizer is for the case where the unique key is based on multiple fields.
 *
 * The key is the text of the key fields, in the configured order, joined by the
 * input field delimiter.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;

        /* Default header, used unless a header line is processed. */
        auto defaultHeaders = _keyFieldIndices.map!(idx => fieldHeaderFromIndex(idx));
        _keyFieldHeader = defaultHeaders.join(inputFieldDelimiter);
    }

    /** Header text for the key columns of the summary output. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Joins the key fields of a line, in key order, with the input field delimiter. */
    private string joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(idx => lineFields[idx]).join(inputFieldDelimiter).to!string;
    }

    /** Processes a header line. For the first header line, the key field header
     *  becomes the joined text of the key fields. Returns true if this was the first
     *  header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(!_keyFieldIndices.any!(idx => idx >= lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        immutable bool firstHeader = super.processHeaderLine(lineFields);
        if (firstHeader) _keyFieldHeader = joinKeyFields(lineFields);
        return firstHeader;
    }

    /** Processes a data line, using the joined key fields as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(!_keyFieldIndices.any!(idx => idx >= lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        processNextLineWithKey(joinKeyFields(lineFields), lineFields);
    }
}
916 
917 version(unittest)
918 {
919     /* testSummarizer is a helper that can run many types of unit tests against
920      * Summarizers. It can also test operators, but there are separate helper functions
921      * better suited for that purpose.
922      *
923      * Arguments are a command line args, an input file, and expected output. The
924      * input file and expected output are already split into lines and fields, the helper
925      * manages re-assembly. The program name from the command line args is printed if an
926      * an error occurs, it is useful to identify the test that failed.
927      *
928      * Note: Much of this is a duplication tsvSummarize logic. Better abstraction of
929      * file input/output would enable running unit tests directly on top of tsvSummarize.
930      *
931      * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
932      * there needs to be a physical file when call processArgs. Its hard to get around,
933      * as the intent is to read the header line of the first input file during command
934      * line argument processing. Eventually this unit test process will need to be
935      * rewritten. For now, a file with the equivalent data is being added to the command
936      * line.
937      */
938     void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
939     {
940         import std.array : appender;
941 
942         assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");
943 
944         auto formatAssertMessage(T...)(string msg, T formatArgs)
945         {
946             auto formatString = "[testSummarizer] %s: " ~ msg;
947             return format(formatString, cmdArgs[0], formatArgs);
948         }
949 
950         TsvSummarizeOptions cmdopt;
951         auto savedCmdArgs = cmdArgs.to!string;
952         auto r = cmdopt.processArgs(cmdArgs);
953         assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));
954 
955         assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
956                formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));
957 
958         /* Pick the Summarizer based on the number of key-fields entered. */
959         auto summarizer =
960             (cmdopt.keyFields.length == 0)
961             ? new NoKeySummarizer!(typeof(appender!(char[])()))(
962                 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
963 
964             : (cmdopt.keyFields.length == 1)
965             ? new OneKeySummarizer!(typeof(appender!(char[])()))(
966                 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)
967 
968             : new MultiKeySummarizer!(typeof(appender!(char[])()))(
969                 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
970 
971         /* Add the operators to the Summarizer. */
972         summarizer.setOperators(inputRangeObject(cmdopt.operators[]));
973 
974         /* Process the file one line at a time. */
975         auto lineFields = new char[][](cmdopt.endFieldIndex);
976         bool headerFound = false;
977         foreach (lineNum, line; file.enumerate(1))
978         {
979             /* Copy the needed fields to the fields array. */
980             foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;
981 
982             if (cmdopt.hasHeader && lineNum == 1)
983             {
984                 if (!headerFound)
985                 {
986                     summarizer.processHeaderLine(lineFields);
987                     headerFound = true;
988                 }
989             }
990             else
991             {
992                 try summarizer.processNextLine(lineFields);
993                 catch (Exception exc)
994                 {
995                     assert(false, formatAssertMessage(exc.msg));
996                 }
997             }
998         }
999         auto printOptions = SummarizerPrintOptions(
1000         cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
1001 
1002         auto summarizerOutput = appender!(char[])();
1003 
1004         if (cmdopt.hasHeader || cmdopt.writeHeader)
1005         {
1006             summarizer.writeSummaryHeader(summarizerOutput, printOptions);
1007         }
1008 
1009         summarizer.writeSummaryBody(summarizerOutput, printOptions);
1010         auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
1011         if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";
1012 
1013         assert(summarizerOutput.data == expectedOutput,
1014                formatAssertMessage(
1015                    "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
1016                    expectedOutput.to!string, summarizerOutput.data.to!string));
1017     }
1018 
1019     void writeDataFile(string filepath, string[][] fileData)
1020     {
1021         import std.algorithm;
1022         import std.stdio;
1023 
1024         auto f = filepath.File("w");
1025         foreach (record; fileData) f.writeln(record.joiner("\t"));
1026         f.close;
1027     }
1028 }
1029 
1030 unittest
1031 {
1032     import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1033     import std.file : mkdir, rmdirRecurse;
1034     import std.path : buildPath;
1035 
1036     auto testDir = makeUnittestTempDir("tsv_summarizer");
1037     scope(exit) testDir.rmdirRecurse;
1038 
1039     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1040      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1041      * have separate tests, those tests test the no-key summarizer. The Values operator is
1042      * used in these tests. It engages a number of behaviors, and the results have limited
1043      * ambiguity. Using only one operator limits dependence on individual operators.
1044      *
1045      * Update (April 2020): There now needs to be a real file passed to testSummarizer.
1046      * See the comments with testSummarizer for details.
1047      */
1048 
1049     auto file1 = [["fld1", "fld2", "fld3"],
1050                   ["a", "a",  "3"],
1051                   ["c", "a",  "2b"],
1052                   ["c", "bc", ""],
1053                   ["a", "c",  "2b"],
1054                   ["",  "bc", ""],
1055                   ["c", "bc", "3"]];
1056 
1057     auto file1Path = buildPath(testDir, "file1.tsv");
1058     auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv");
1059     writeDataFile(file1Path, file1);
1060     writeDataFile(file1NoHeaderPath, file1[1 .. $]);
1061 
1062     /* Single-key summarizer tests.
1063      */
1064     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path],
1065                    file1,
1066                    [["fld1", "fld1_values"],
1067                     ["a", "a|a"],
1068                     ["c", "c|c|c"],
1069                     ["",  ""]]
1070         );
1071     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path],
1072                    file1,
1073                    [["fld1", "fld2_values"],
1074                     ["a", "a|c"],
1075                     ["c", "a|bc|bc"],
1076                     ["",  "bc"]]
1077         );
1078     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path],
1079                    file1,
1080                    [["fld1", "fld3_values"],
1081                     ["a", "3|2b"],
1082                     ["c", "2b||3"],
1083                     ["",  ""]]
1084         );
1085     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path],
1086                    file1,
1087                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1088                     ["a", "a|a",   "a|c",     "3|2b"],
1089                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1090                     ["",  "",      "bc",      ""]]
1091         );
1092     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path],
1093                    file1,
1094                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1095                     ["a", "a|a",   "a|c",     "3|2b"],
1096                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1097                     ["",  "",      "bc",      ""]]
1098         );
1099     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path],
1100                    file1,
1101                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1102                     ["a", "3|2b",  "a|c",     "a|a"],
1103                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1104                     ["",  "",      "bc",      ""]]
1105         );
1106     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path],
1107                    file1,
1108                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1109                     ["a", "3|2b",  "a|c",     "a|a"],
1110                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1111                     ["",  "",      "bc",      ""]]
1112         );
1113     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path],
1114                    file1,
1115                    [["fld2", "fld1_values"],
1116                     ["a",  "a|c"],
1117                     ["bc", "c||c"],
1118                     ["c",  "a"]]
1119         );
1120     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path],
1121                    file1,
1122                    [["fld2", "fld2_values"],
1123                     ["a",  "a|a"],
1124                     ["bc", "bc|bc|bc"],
1125                     ["c",  "c"]]
1126         );
1127     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path],
1128                    file1,
1129                    [["fld2", "fld3_values"],
1130                     ["a",  "3|2b"],
1131                     ["bc", "||3"],
1132                     ["c",  "2b"]]
1133         );
1134     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path],
1135                    file1,
1136                    [["fld2", "fld1_values", "fld3_values"],
1137                     ["a",  "a|c",  "3|2b"],
1138                     ["bc", "c||c", "||3"],
1139                     ["c",  "a",    "2b"]]
1140         );
1141     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path],
1142                    file1,
1143                    [["fld2", "fld3_values", "fld1_values"],
1144                     ["a",  "3|2b", "a|c"],
1145                     ["bc", "||3",  "c||c"],
1146                     ["c",  "2b",   "a"]]
1147         );
1148     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path],
1149                    file1,
1150                    [["fld3", "fld1_values"],
1151                     ["3",  "a|c"],
1152                     ["2b", "c|a"],
1153                     ["",   "c|"]]
1154         );
1155     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path],
1156                    file1,
1157                    [["fld3", "fld2_values"],
1158                     ["3",  "a|bc"],
1159                     ["2b", "a|c"],
1160                     ["",   "bc|bc"]]
1161         );
1162     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path],
1163                    file1,
1164                    [["fld3", "fld1_values", "fld2_values"],
1165                     ["3",  "a|c", "a|bc"],
1166                     ["2b", "c|a", "a|c"],
1167                     ["",   "c|",  "bc|bc"]]
1168         );
1169 
1170     /* Multi-key summarizer tests.
1171      */
1172     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path],
1173                    file1,
1174                    [["fld1", "fld2", "fld1_values"],
1175                     ["a", "a",  "a"],
1176                     ["c", "a",  "c"],
1177                     ["c", "bc", "c|c"],
1178                     ["a", "c",  "a"],
1179                     ["", "bc",  ""]]
1180         );
1181     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path],
1182                    file1,
1183                    [["fld1", "fld2", "fld2_values"],
1184                     ["a", "a",  "a"],
1185                     ["c", "a",  "a"],
1186                     ["c", "bc", "bc|bc"],
1187                     ["a", "c",  "c"],
1188                     ["", "bc",  "bc"]]
1189         );
1190     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path],
1191                    file1,
1192                    [["fld1", "fld2", "fld3_values"],
1193                     ["a", "a",  "3"],
1194                     ["c", "a",  "2b"],
1195                     ["c", "bc", "|3"],
1196                     ["a", "c",  "2b"],
1197                     ["", "bc",  ""]]
1198         );
1199     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path],
1200                    file1,
1201                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1202                     ["a", "a",  "3", "a"],
1203                     ["c", "a",  "2b", "c"],
1204                     ["c", "bc", "|3", "c|c"],
1205                     ["a", "c",  "2b", "a"],
1206                     ["",  "bc", "",   ""]]
1207         );
1208     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path],
1209                    file1,
1210                    [["fld3", "fld2", "fld1_values"],
1211                     ["3",  "a",  "a"],
1212                     ["2b", "a",  "c"],
1213                     ["",   "bc", "c|"],
1214                     ["2b", "c",  "a"],
1215                     ["3",  "bc", "c"]]
1216         );
1217     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path],
1218                    file1,
1219                    [["fld3", "fld2", "fld1_values"],
1220                     ["3",  "a",  "a"],
1221                     ["2b", "a",  "c"],
1222                     ["",   "bc", "c|"],
1223                     ["2b", "c",  "a"],
1224                     ["3",  "bc", "c"]]
1225         );
1226     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path],
1227                    file1,
1228                    [["fld2", "fld1", "fld3", "fld2_values"],
1229                     ["a",  "a", "3",  "a"],
1230                     ["a",  "c", "2b", "a"],
1231                     ["bc", "c", "",   "bc"],
1232                     ["c",  "a", "2b", "c"],
1233                     ["bc", "",  "",   "bc"],
1234                     ["bc", "c", "3",  "bc"]]
1235         );
1236 
1237     /* Missing policies. */
1238     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path],
1239                    file1,
1240                    [["fld1", "fld1_values"],
1241                     ["a", "a|a"],
1242                     ["c", "c|c|c"],
1243                     ["",  ""]]
1244         );
1245     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path],
1246                    file1,
1247                    [["fld1", "fld2_values"],
1248                     ["a", "a|c"],
1249                     ["c", "a|bc|bc"],
1250                     ["",  "bc"]]
1251         );
1252     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path],
1253                    file1,
1254                    [["fld1", "fld3_values"],
1255                     ["a", "3|2b"],
1256                     ["c", "2b|3"],
1257                     ["",  ""]]
1258         );
1259     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path],
1260                    file1,
1261                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1262                     ["a", "a|a",   "a|c",     "3|2b"],
1263                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1264                     ["",  "",      "bc",      ""]]
1265         );
1266     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path],
1267                    file1,
1268                    [["fld1", "fld1_values"],
1269                     ["a", "a|a"],
1270                     ["c", "c|c|c"],
1271                     ["",  "NA"]]
1272         );
1273     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path],
1274                    file1,
1275                    [["fld1", "fld2_values"],
1276                     ["a", "a|c"],
1277                     ["c", "a|bc|bc"],
1278                     ["",  "bc"]]
1279         );
1280     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path],
1281                    file1,
1282                    [["fld1", "fld3_values"],
1283                     ["a", "3|2b"],
1284                     ["c", "2b|NA|3"],
1285                     ["",  "NA"]]
1286         );
1287     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path],
1288                    file1,
1289                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1290                     ["a", "a|a",   "a|c",     "3|2b"],
1291                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1292                     ["",  "NA",      "bc",      "NA"]]
1293         );
1294     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path],
1295                    file1,
1296                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1297                     ["a", "a",  "3", "a"],
1298                     ["c", "a",  "2b", "c"],
1299                     ["c", "bc", "3", "c|c"],
1300                     ["a", "c",  "2b", "a"],
1301                     ["",  "bc", "",   ""]]
1302         );
1303     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path],
1304                    file1,
1305                    [["fld3", "fld2", "fld1_values"],
1306                     ["3",  "a",  "a"],
1307                     ["2b", "a",  "c"],
1308                     ["",   "bc", "c"],
1309                     ["2b", "c",  "a"],
1310                     ["3",  "bc", "c"]]
1311         );
1312     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path],
1313                    file1,
1314                    [["fld2", "fld1", "fld3", "fld2_values"],
1315                     ["a",  "a", "3",  "a"],
1316                     ["a",  "c", "2b", "a"],
1317                     ["bc", "c", "",   "bc"],
1318                     ["c",  "a", "2b", "c"],
1319                     ["bc", "",  "",   "bc"],
1320                     ["bc", "c", "3",  "bc"]]
1321         );
1322     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path],
1323                    file1,
1324                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1325                     ["a", "a",  "3", "a"],
1326                     ["c", "a",  "2b", "c"],
1327                     ["c", "bc", "NA|3", "c|c"],
1328                     ["a", "c",  "2b", "a"],
1329                     ["",  "bc", "NA",   "NA"]]
1330         );
1331     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path],
1332                    file1,
1333                    [["fld3", "fld2", "fld1_values"],
1334                     ["3",  "a",  "a"],
1335                     ["2b", "a",  "c"],
1336                     ["",   "bc", "c|NA"],
1337                     ["2b", "c",  "a"],
1338                     ["3",  "bc", "c"]]
1339         );
1340     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path],
1341                    file1,
1342                    [["fld2", "fld1", "fld3", "fld2_values"],
1343                     ["a",  "a", "3",  "a"],
1344                     ["a",  "c", "2b", "a"],
1345                     ["bc", "c", "",   "bc"],
1346                     ["c",  "a", "2b", "c"],
1347                     ["bc", "",  "",   "bc"],
1348                     ["bc", "c", "3",  "bc"]]
1349         );
1350 
1351     /* Validate that the no-key summarizer works with testSummarizer helper function.
1352      */
1353     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path],
1354                    file1,
1355                    [["fld1_values", "fld2_values"],
1356                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1357         );
1358 
1359     /* Header variations: no header line; auto-generated header line; custom headers.
1360      */
1361     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath],
1362                    file1[1..$],
1363                    [["a", "a|a"],
1364                     ["c", "c|c|c"],
1365                     ["",  ""]]
1366         );
1367     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath],
1368                    file1[1..$],
1369                    [["a", "a",  "a"],
1370                     ["c", "a",  "a"],
1371                     ["c", "bc", "bc|bc"],
1372                     ["a", "c",  "c"],
1373                     ["", "bc",  "bc"]]
1374         );
1375     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath],
1376                    file1[1..$],
1377                    [["field2", "field1_values"],
1378                     ["a",  "a|c"],
1379                     ["bc", "c||c"],
1380                     ["c",  "a"]]
1381         );
1382     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath],
1383                    file1[1..$],
1384                    [["field3", "field2", "field1_values"],
1385                     ["3",  "a",  "a"],
1386                     ["2b", "a",  "c"],
1387                     ["",   "bc", "c|"],
1388                     ["2b", "c",  "a"],
1389                     ["3",  "bc", "c"]]
1390         );
1391     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path],
1392                    file1,
1393                    [["fld2", "Field3Values"],
1394                     ["a",  "3|2b"],
1395                     ["bc", "||3"],
1396                     ["c",  "2b"]]
1397         );
1398     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path],
1399                    file1,
1400                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1401                     ["a", "a",  "3", "a"],
1402                     ["c", "a",  "2b", "c"],
1403                     ["c", "bc", "|3", "c|c"],
1404                     ["a", "c",  "2b", "a"],
1405                     ["",  "bc", "",   ""]]
1406         );
1407     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath],
1408                    file1[1..$],
1409                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1410                     ["a", "3|2b",  "a|c",     "a|a"],
1411                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1412                     ["",  "",      "bc",      ""]]
1413         );
1414     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1415                    file1[1..$],
1416                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1417                     ["a", "3",  "a",  "3",  "a", "a"],
1418                     ["c", "2b", "a",  "2b", "c", "a"],
1419                     ["c", "",   "bc", "",   "c", "bc"],
1420                     ["a", "2b", "c",  "2b", "a", "c"],
1421                     ["",  "",   "bc", "",   "",  "bc"],
1422                     ["c", "3",  "bc", "3",  "c", "bc"]]
1423         );
1424     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath],
1425                    file1[1..$],
1426                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1427                     ["a", "3",  "a",  "3",  "a", "a"],
1428                     ["c", "2b", "a",  "2b", "c", "a"],
1429                     ["c", "",   "bc", "",   "c", "bc"],
1430                     ["a", "2b", "c",  "2b", "a", "c"],
1431                     ["",  "",   "bc", "",   "",  "bc"],
1432                     ["c", "3",  "bc", "3",  "c", "bc"]]
1433         );
1434 
1435     /* Alternate file widths and lengths.
1436      */
1437 
1438     auto file3x2 = [["fld1", "fld2", "fld3"],
1439                     ["a", "b", "c"],
1440                     ["c", "b", "a"]];
1441 
1442     auto file3x2Path = buildPath(testDir, "file3x2.tsv");
1443     auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv");
1444     writeDataFile(file3x2Path, file3x2);
1445     writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]);
1446 
1447     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path],
1448                    file3x2,
1449                    [["fld1", "fld3_values"],
1450                     ["a", "c"],
1451                     ["c", "a"]]
1452         );
1453     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path],
1454                    file3x2,
1455                    [["fld2", "fld3_values"],
1456                     ["b", "c|a"]]
1457         );
1458     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path],
1459                    file3x2,
1460                    [["fld2", "fld1", "fld3_values"],
1461                     ["b", "a", "c"],
1462                     ["b", "c", "a"]]
1463         );
1464 
1465     auto file3x1 = [["fld1", "fld2", "fld3"],
1466                     ["a", "b", "c"]];
1467 
1468     auto file3x1Path = buildPath(testDir, "file3x1.tsv");
1469     auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv");
1470     writeDataFile(file3x1Path, file3x1);
1471     writeDataFile(file3x1NoHeaderPath, file3x1[1 .. $]);
1472 
1473     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path],
1474                    file3x1,
1475                    [["fld1", "fld3_values"],
1476                     ["a", "c"]]
1477         );
1478     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath],
1479                    file3x1[1..$],
1480                    [["a", "c"]]
1481         );
1482     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path],
1483                    file3x1,
1484                    [["fld2", "fld1", "fld3_values"],
1485                     ["b", "a", "c"]]
1486         );
1487     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath],
1488                    file3x1[1..$],
1489                    [["b", "a", "c"]]
1490         );
1491 
1492     auto file3x0 = [["fld1", "fld2", "fld3"]];
1493 
1494     auto file3x0Path = buildPath(testDir, "file3x0.tsv");
1495     auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv");
1496     writeDataFile(file3x0Path, file3x0);
1497     writeDataFile(file3x0NoHeaderPath, file3x0[1 .. $]);
1498 
1499 
1500     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path],
1501                    file3x0,
1502                    [["fld1", "fld3_values"]]
1503         );
1504     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1505                    file3x0[1..$],
1506                    []
1507         );
1508     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath],
1509                    file3x0[1..$],
1510                    [["field1", "field3_values"]]
1511         );
1512 
1513 
1514     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path],
1515                    file3x0,
1516                    [["fld2", "fld1", "fld3_values"]]
1517         );
1518 
1519     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1520                    file3x0[1..$],
1521                    []
1522         );
1523 
1524     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath],
1525                    file3x0[1..$],
1526                    [["field2", "field1", "field3_values"]]
1527         );
1528 
1529     auto file2x1 = [["fld1", "fld2"],
1530                     ["a", "b"]];
1531 
1532     auto file2x1Path = buildPath(testDir, "file2x1.tsv");
1533     auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv");
1534     writeDataFile(file2x1Path, file2x1);
1535     writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]);
1536 
1537     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path],
1538                    file2x1,
1539                    [["fld1", "fld2_values"],
1540                     ["a", "b"]]
1541         );
1542     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path],
1543                    file2x1,
1544                    [["fld2", "fld1", "fld1_values"],
1545                     ["b", "a", "a"]]
1546         );
1547 
1548     auto file2x0 = [["fld1", "fld2"]];
1549 
1550     auto file2x0Path = buildPath(testDir, "file2x0.tsv");
1551     auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv");
1552     writeDataFile(file2x0Path, file2x0);
1553     writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]);
1554 
1555     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path],
1556                    file2x0,
1557                    [["fld1", "fld2_values"]]
1558         );
1559     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path],
1560                    file2x0,
1561                    [["fld2", "fld1", "fld1_values"]]
1562         );
1563 
1564     auto file1x2 = [["fld1"],
1565                     ["a"],
1566                     [""]];
1567 
1568     auto file1x2Path = buildPath(testDir, "file1x2.tsv");
1569     auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv");
1570     writeDataFile(file1x2Path, file1x2);
1571     writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]);
1572 
1573     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path],
1574                    file1x2,
1575                    [["fld1", "fld1_values"],
1576                     ["a", "a"],
1577                     ["",  ""]]
1578         );
1579 
1580     auto file1x2b = [["fld1"],
1581                      [""],
1582                      [""]];
1583 
1584     auto file1x2bPath = buildPath(testDir, "file1x2b.tsv");
1585     auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv");
1586     writeDataFile(file1x2bPath, file1x2b);
1587     writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]);
1588 
1589     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath],
1590                    file1x2b,
1591                    [["fld1", "fld1_values"],
1592                     ["", "|"]]
1593         );
1594 
1595     auto file1x1 = [["fld1"],
1596                     ["x"]];
1597 
1598     auto file1x1Path = buildPath(testDir, "file1x1.tsv");
1599     auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv");
1600     writeDataFile(file1x1Path, file1x1);
1601     writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]);
1602 
1603     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path],
1604                    file1x1,
1605                    [["fld1", "fld1_values"],
1606                     ["x", "x"]]
1607         );
1608 
1609     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1610                    file1x1[1..$],
1611                    [["x", "x"]]
1612         );
1613 
1614     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath],
1615                    file1x1[1..$],
1616                    [["field1", "field1_values"],
1617                     ["x", "x"]]
1618         );
1619 
1620     auto file1x1b = [["fld1"],
1621                     [""]];
1622 
1623     auto file1x1bPath = buildPath(testDir, "file1x1b.tsv");
1624     auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv");
1625     writeDataFile(file1x1bPath, file1x1b);
1626     writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]);
1627 
1628     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath],
1629                    file1x1b,
1630                    [["fld1", "fld1_values"],
1631                     ["", ""]]
1632         );
1633 
1634     auto file1x0 = [["fld1"]];
1635 
1636     auto file1x0Path = buildPath(testDir, "file1x0.tsv");
1637     auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv");
1638     writeDataFile(file1x0Path, file1x0);
1639     writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]);
1640 
1641     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path],
1642                    file1x0,
1643                    [["fld1", "fld1_values"]]
1644         );
1645 
1646     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1647                    file1x0[1..$],
1648                    []
1649         );
1650 
1651     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath],
1652                    file1x0[1..$],
1653                    [["field1", "field1_values"]]
1654         );
1655 
1656     /* Alternate delimiters. */
1657     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1Path],
1658                    file1,
1659                    [["fld1_values", "fld2_values"],
1660                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1661         );
1662     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path],
1663                    file1,
1664                    [["fld1_values", "fld2_values"],
1665                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1666         );
1667     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1Path],
1668                    file1,
1669                    [["fld1_values", "fld2_values"],
1670                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1671         );
1672     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1673                     "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath],
1674                    file1[1..$],
1675                    [["field2", "field1_values"],
1676                     ["a",  "a:c"],
1677                     ["bc", "c::c"],
1678                     ["c",  "a"]]
1679         );
1680     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1681                     "--values-delimiter", "\\", file1NoHeaderPath],
1682                    file1[1..$],
1683                    [["a", "a",  "a"],
1684                     ["c", "a",  "a"],
1685                     ["c", "bc", "bc\\bc"],
1686                     ["a", "c",  "c"],
1687                     ["", "bc",  "bc"]]
1688         );
1689 }
1690 
1691 /* Summary Operators and Calculators
1692  *
1693  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1694  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1695  * Calculator is used to manage the summary calculation for each unique key in the input.
1696  *
1697  * As an example, consider the command:
1698  *
1699  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1700  *
1701  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1702  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1703  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1704  * calculator needs to track occurrence count and sum. Calculators produce the final
1705  * value when all processing is finished.
1706  *
1707  * Summary field headers
1708  *
1709  * There are several options for specifying summary field headers. The defaults combine the
1710  * operator name and the header of the field summarized. The defaults can be overridden on
1711  * the command line. These scenarios are supported via the operator constructor and the
1712  * processHeaderLine() method.
1713  *
1714  * Missing field policy
1715  *
1716  * At present, tsv-summarize has a single policy for handling missing values that applies
1717  * to all operators. However, it is logically operator specific and is implemented that
1718  * way. The MissingFieldPolicy class describes the policy; each operator contains one.
1719  * Calculators access their operator's policy object.
1720  */
1721 
/** An Operator represents one summary calculation requested on the command line,
 *  e.g. '--mean 5'. It supplies the output header for the summary column and creates
 *  the Calculator objects that do the per-key work.
 */
interface Operator
{
    /// Header text for this operator's output column.
    string header() @property;

    /// The operator's name.
    string name() @property;

    /// Receives the split fields of the input header line.
    void processHeaderLine(const char[][] fields);

    /// Indices of the fields this Operator needs saved as numeric values.
    size_t[] numericFieldsToSave();

    /// Indices of the fields this Operator needs saved as text values.
    size_t[] textFieldsToSave();

    /// Creates a new Calculator for this operator.
    Calculator makeCalculator();
}
1734 
/** A Calculator carries out a single computation. It is handed each input line in turn
 *  and produces the final value once all input has been processed.
 */
interface Calculator
{
    /// Consumes the split fields of one input line.
    void processNextLine(const char[][] fields);

    /// Produces the final result as a string. The saved value lists and the print
    /// options are supplied by the caller.
    string calculate(
        UniqueKeyValuesLists valuesLists,
        const ref SummarizerPrintOptions printOptions);
}
1743 
/** Describes what happens when a missing (empty) field value is encountered.
 *
 * Exactly one of three modes is in effect:
 *  - use:     missing values are processed unchanged (the default).
 *  - replace: missing values are replaced by a non-empty replacement string.
 *  - exclude: missing values are skipped.
 *
 * A non-empty replacement string always selects 'replace', regardless of the
 * excludeMissing argument.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are processed as-is.
    private bool _replaceMissing = false;     // Missing values are swapped for the replacement.
    private string _missingReplacement;       // Replacement text; meaningful only in 'replace' mode.

    /** Creates a policy; see updatePolicy for the meaning of the arguments. */
    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /** Resets the policy.
     *
     * Params:
     *   excludeMissing     = Skip missing values. Ignored when a replacement is given.
     *   missingReplacement = Replacement for missing values. An empty string means
     *                        no replacement.
     */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        const bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /** A field is considered missing when it is empty. */
    @property bool isMissingField(const char[] field) const { return field.length == 0; }

    /** True if missing values are processed unchanged. */
    @property bool useMissing() const { return _useMissing; }

    /** True if missing values are skipped (neither used nor replaced). */
    @property bool excludeMissing() const { return !(_useMissing || _replaceMissing); }

    /** True if missing values are replaced by missingReplacement. */
    @property bool replaceMissing() const { return _replaceMissing; }

    /** The replacement string given to the constructor or updatePolicy. */
    @property string missingReplacement() const { return _missingReplacement; }
}
1789 
1790 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
1791  * while reading data. Operations like median collect all values and operate on them when
1792  * running the final calculation. Value lists are needed for each unique key. A command
1793  * using multiple Operators may save multiple fields. And, different Operators may be run
1794  * against the same field.
1795  *
1796  * The last part motivates these classes. Handling large data sets necessitates minimizing
1797  * in-memory storage, making it desirable to share identical lists between Calculators.
1798  * Otherwise, each Calculator could implement its own storage, which would be simpler.
1799  *
1800  * The setup works as follows:
1801  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
1802  *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
1803  *    of the fields advertised by Operators as needing sharing. This list gets created
1804  *    during command initialization (SummarizerBase.setOperators).
1805  *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
1806  *    time a new unique key is found, in parallel to the Calculator objects created for the
1807  *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
1808  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
1809  *    Calculators, saving the values.
1810  *  - Calculators retrieve the saved values during the calculation phase. The calculator's
1811  *    processNextField method is typically a no-op.
1812  *  - Calculators cannot make assumptions about the order of the saved values. This is
1813  *    a pragmatic concession to median and quantile calculations, which need to sort the data,
1814  *    at least partially. Rather than generate sorted copies, the current algorithms
1815  *    sort the data in place.
1816  *
1817  * One concession to duplicate storage is that text and numeric versions of the same
1818  * field might be stored. The reason is because it's important to convert text to numbers
1819  * as they are read so that useful error messages can be generated. And, storing both
1820  * forms of the same field should be less common.
1821  *
1822  * The current implementation uses the same missing values policy for all fields. If
1823  * multiple policies become supported this will need to change.
1824  *
1825  * Built-in calculations - UniqueKeyValuesLists objects have a built-in median operation. This is
1826  * to avoid repeated calculations of the median by different calculations.
1827  */
1828 
/** SharedFieldValues records which field indices must have their values saved, and
 *  creates the per-key UniqueKeyValuesLists objects that hold those values.
 */
final class SharedFieldValues
{
    // Indices of the fields whose values need to be saved, without duplicates.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Registers a field index whose numeric values should be saved. Called during
     * summarizer setup, e.g. '--median 7' ends up calling addNumericIndex(6), 6 being
     * the zero-based index. Registering an index that is already present is a no-op.
     */
    final void addNumericIndex (size_t index)
    {
        if (_numericFieldIndices.canFind(index)) return;
        _numericFieldIndices ~= index;
    }

    /* Text counterpart of addNumericIndex. */
    final void addTextIndex (size_t index)
    {
        if (_textFieldIndices.canFind(index)) return;
        _textFieldIndices ~= index;
    }

    /* Creates the value lists for one unique key. Called every time a new key is
     * found, or once at the beginning of the program if no keys are being used
     * (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto keyValuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return keyValuesLists;
    }
}
1858 
/** UniqueKeyValuesLists holds the saved field values for one unique key.
 *
 * It keeps one FieldValues list per saved field index, separately for numeric and text
 * values. Each input line for the key is passed to processNextLine, which appends
 * the relevant field values. Calculators read the lists back through the accessor
 * methods (numericValues, textValues, etc.) during the calculation phase.
 *
 * The returned arrays are the live storage, not copies. Sorting and median
 * calculations reorder them in place, so callers must not rely on value order
 * unless they ask for a sorted array.
 */
final class UniqueKeyValuesLists
{
    /* A FieldValues object holds the list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved.
     * One FieldValues list is created per index. Empty index arrays leave the
     * corresponding list array empty.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes one input line to every value list so each can save its field. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    /* Linear search for the numeric list of a field index. The index must have been
     * passed to the constructor; this is checked by assert only.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Text counterpart of findNumericFieldValues. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Saved numeric values for a field, in no particular order. */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    /* Saved numeric values for a field, sorted in place. */
    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    /* Saved text values for a field, in no particular order. */
    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    /* Saved text values for a field, sorted in place. */
    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    /* Median of the saved numeric values for a field. The result is cached until
     * another value is added.
     */
    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* The list of values saved for a single field, plus cached state (sorted flag,
     * median) that is invalidated whenever the list changes.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : Appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;     // True if _medianValue is current.
        private bool _isSorted = false;       // True if _values is known to be fully sorted.
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this list's field from the input line, applying the missing value
         * policy: a missing value is stored as-is, replaced, or skipped. The text is
         * converted to ValueType as it is read, so conversion errors surface here.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                _values.put(field.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
            else if (missingPolicy.replaceMissing)
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The underlying array, in whatever order it currently has. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The underlying array, sorted in place. The sort is skipped when the array
         * is already known to be sorted.
         */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* The median value, computed on first use and cached until the list changes. */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                /* The median algorithm reorders the array in place and need not leave
                 * it fully sorted, so a previously established sort order can no
                 * longer be trusted. Drop the flag so the next getSortedArray call
                 * sorts again instead of returning stale, possibly unsorted data.
                 */
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}
2015 
/** SingleFieldOperator is the base class for operators that work on a single field, the
 * most common kind of Operator. It holds the operator name, the zero-based field index,
 * the output header, the list of fields to save, and the missing field policy. Derived
 * classes implement makeCalculator and the Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Sets up the operator and its default header. The default header is derived from
     * the field index; the operator name is used as the suffix unless useHeaderSuffix
     * is turned off. It may later be replaced by a custom header or by the header line.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        string headerSuffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), headerSuffix);
    }

    /* Installs a user supplied header. Once set, processHeaderLine no longer changes
     * the header. Only valid for operators that allow custom headers.
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    final string name() const @property { return _name; }

    final bool allowCustomHeader() const @property { return _allowCustomHeader; }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of this operator's field should be saved. These should be called
     * during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final MissingFieldPolicy missingPolicy() @property { return _missingPolicy; }

    final size_t fieldIndex() const @property { return _fieldIndex; }

    final string header() const @property { return _header; }

    final bool useHeaderSuffix() const @property { return _useHeaderSuffix; }

    /* Derives the output header from the input header line, unless a custom header
     * has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        string headerSuffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, headerSuffix);
    }

    final size_t[] numericFieldsToSave() { return _numericFieldsToSave; }

    final size_t[] textFieldsToSave() { return _textFieldsToSave; }

    abstract SingleFieldCalculator makeCalculator();
}
2119 
/** SingleFieldCalculator is the base class for calculators that work on a single field,
 * the common case. It picks the field out of each input line and applies the operator's
 * missing field policy. Derived classes implement processNextField() rather than
 * processNextLine().
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final size_t fieldIndex() const @property { return _fieldIndex; }

    /* Extracts this calculator's field from the line and forwards it to
     * processNextField. A missing value is forwarded as-is, replaced, or dropped,
     * depending on the operator's policy.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] fieldValue = fields[_fieldIndex];

        if (!policy.useMissing && policy.isMissingField(fieldValue))
        {
            /* Missing value that is not to be used as-is: substitute the replacement
             * if there is one, otherwise skip the value entirely.
             */
            if (policy.replaceMissing) processNextField(policy.missingReplacement);
            return;
        }

        processNextField(fieldValue);
    }

    /* The operator that created this calculator; its missing field policy is used. */
    abstract SingleFieldOperator getOperator();

    /* Receives the (possibly replaced) field value for each processed line. */
    abstract void processNextField(const char[] field);
}
2158 
2159 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2160 version(unittest)
2161 {
2162     /** A helper for SingleFieldOperator unit tests.
2163      *
2164      * testSingleFieldOperator takes a set of split file values, a field index, a header
2165      * suffix, and a set of expected values. The expected values array contains the
2166      * initial value (zero entries) and the expected values after each line. (One more
2167      * expected value than input lines.) The zero entry case is what is generated for an
2168      * empty file. An example testing the 'min' operator against a file with 2 columns,
2169      * 3 rows, using field index 1:
2170      *
2171      *    testSingleFieldOperator!MinOperator(
2172      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2173      *        ["5", "50"],
2174      *        ["20", "200"]],
2175      *       1,                            // Field index (zero-based, so "100", "50", "200")
2176      *       "min",                        // The header suffix, normally the operator name.
2177      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2178      *
2179      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2180      * Then run the operator is tested against each column, a total of six calls. Headers
2181      * are automatically checked. Additional entries can be used to extend coverage.
2182      *
2183      * A non-default MissingFieldPolicy can be provide as an optional last argument.
2184      * Operator tests should include exclusion and replacement variations. See operator
2185      * unit tests for details.
2186      *
2187      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2188      * init arguments. Currently this is used only by the quantile operator.
2189      *
2190      * These tests do not check unique key behavior (group-by). Operators don't have info
2191      * about unique keys, and interact with them only indirectly, via Calculators.
2192      */
2193     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2194         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2195          const char[][] expectedValues,
2196          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2197     {
2198         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2199     }
2200 
2201     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2202         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2203          const char[][] expectedValues,
2204          MissingFieldPolicy missingPolicy,
2205          T extraOpInitArgs)
2206     {
2207         import std.format : format;
2208         import std.array : appender;
2209         import std.string : chomp;
2210         import std.traits : EnumMembers;
2211 
2212         auto numFields = (splitFile[0]).length;
2213 
2214         assert(fieldIndex < numFields,
2215                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2216                       headerSuffix));
2217         assert(splitFile.length + 1 == expectedValues.length,
2218                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2219                       headerSuffix));
2220 
2221         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2222         auto printOptions = SummarizerPrintOptions('#', '|');
2223 
2224         /* An input header line. */
2225         string[] inputHeaderLine = new string[numFields];
2226         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2227 
2228         /* The different expected output field headers. */
2229         auto outputFieldHeaderWithNoHeaderLine =
2230             fieldHeaderFromIndex(fieldIndex)
2231             .summaryHeaderFromFieldHeader(headerSuffix);
2232         auto outputFieldHeaderFromHeaderLine =
2233             inputHeaderLine[fieldIndex]
2234             .summaryHeaderFromFieldHeader(headerSuffix);
2235         auto customOutputFieldHeader = "custom";
2236 
2237         enum HeaderUsecase {
2238             HeaderLine_DefaultHeader,
2239             HeaderLine_CustomHeader,
2240             NoHeaderLine_DefaultHeader,
2241             NoHeaderLine_CustomHeader,
2242             NoHeaderLine_NoOutputHeader,
2243         }
2244 
2245         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2246         {
2247             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2248                           op.name, hc, actual, expected);
2249         }
2250 
2251         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2252                                   const char[] actual, const char[] expected)
2253         {
2254             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2255                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2256         }
2257 
2258         /* Run the logic for each header use case. */
2259         foreach (hc; EnumMembers!HeaderUsecase)
2260         {
2261             bool hasInputHeader = (
2262                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2263                 hc == HeaderUsecase.HeaderLine_CustomHeader
2264                 );
2265             bool hasOutputHeader = (
2266                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2267                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2268                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2269                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2270                 );
2271             bool hasCustomHeader = (
2272                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2273                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2274                 );
2275 
2276             if (hasCustomHeader) assert(hasOutputHeader);
2277 
2278             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2279 
2280             if (hasCustomHeader)
2281             {
2282                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2283                 op.setCustomHeader(customOutputFieldHeader);
2284             }
2285 
2286             Operator[] operatorArray;
2287             operatorArray ~= op;
2288 
2289             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2290             summarizer.setOperators(inputRangeObject(operatorArray));
2291 
2292             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2293 
2294             if (hasOutputHeader)
2295             {
2296                 /* Write the header line. Note that this is a one-field header, */
2297                 auto headerLineOutput = appender!(char[])();
2298                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2299 
2300                 /* Test that the header was generated correctly.
2301                  *
2302                  * Note: Because the output is generated by a Summarizer, it will have a
2303                  * trailing newline. Use chomp to trim it.
2304                  */
2305                 final switch (hc)
2306                 {
2307                 case HeaderUsecase.HeaderLine_DefaultHeader:
2308                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2309                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2310                                                outputFieldHeaderFromHeaderLine));
2311                     break;
2312                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2313                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2314                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2315                                                outputFieldHeaderWithNoHeaderLine));
2316                     break;
2317                 case HeaderUsecase.HeaderLine_CustomHeader:
2318                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2319                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2320                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2321                                                customOutputFieldHeader));
2322                     break;
2323                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2324                     break;
2325                }
2326 
2327             }
2328 
2329             /* For each line, process the line, generate the output, and test that the
2330              * value is correct. Start with the empty file case.
2331              */
2332             foreach (i, const char[] expected; expectedValues)
2333             {
2334                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2335                 auto summaryLineOutput = appender!(char[])();
2336                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2337                 assert(summaryLineOutput.data.chomp == expected,
2338                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2339                                           summaryLineOutput.data.chomp, expectedValues[i]));
2340             }
2341         }
2342     }
2343 }
2344 
/** ZeroFieldOperator is the base class for operators that take no input fields. The
 * main use case is the CountOperator, which counts the occurrences of each unique
 * key. Other uses are possible, for example, weighted random number assignment.
 *
 * ZeroFieldOperator and ZeroFieldCalculator exist mainly to make clear what
 * information such an operator can use. The split fields passed to
 * processHeaderLine and processNextLine don't include all fields in the input (only
 * fields required by operators acting on specific fields are included), which might
 * not be obvious when implementing an operator.
 *
 * The operator name doubles as the default output header.
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    this(string operatorName)
    {
        _header = _name = operatorName;
    }

    /// Replaces the output header.
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /// Custom headers are accepted by default.
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /// The operator name given to the constructor.
    final string name() const @property
    {
        return _name;
    }

    /// The current output header: the operator name unless a custom header was set.
    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    /// Returns a calculator for this operator. Implemented by derived operators.
    abstract ZeroFieldCalculator makeCalculator();
}
2407 
/** ZeroFieldCalculator is the base class for calculators that don't use fields as
 * input, in particular the calculator of the Count operator. It is the companion of
 * the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() instead of processNextLine(), and
 * the single argument form of calculate() instead of the two argument form.
 */
class ZeroFieldCalculator : Calculator
{
    this() { }

    /// Ignores the fields and reports one more entry to the derived class.
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        processNextEntry();
    }

    /// Ignores the saved values lists and defers to the single argument form.
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return this.calculate(printOptions);
    }

    /// Called once for each line passed to processNextLine().
    abstract void processNextEntry();

    /// Produces the summary value as a string.
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}
2432 
2433 version(unittest)
2434 {
2435     /* A helper for ZeroFieldOperator unit tests.
2436      *
2437      * testZeroFieldOperator takes a set of split file values, a default header, and a
2438      * set of expected values. The expected values array contains the expected values
2439      * after each line.
2440      *
2441      * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
2442      * there is no use of field indices and fewer types of headers. See the latter's
2443      * documentation and the CountOperator unit tests for examples.
2444      */
2445     void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
2446         (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
2447     {
2448         import std.format : format;
2449         import std.array : appender;
2450         import std.string : chomp;
2451         import std.traits : EnumMembers;
2452 
2453         auto numFields = (splitFile[0]).length;
2454 
2455         assert(splitFile.length + 1 == expectedValues.length,
2456                format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2457                       defaultHeader));
2458 
2459         /* printOptions - Not used these tests, but needed for API calls. */
2460         auto printOptions = SummarizerPrintOptions('#', '|');
2461 
2462         /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
2463         auto missingPolicy = new MissingFieldPolicy;
2464 
2465         /* An input header line. */
2466         string[] inputHeaderLine = new string[numFields];
2467         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2468 
2469         auto customOutputFieldHeader = "custom";
2470 
2471         enum HeaderUsecase {
2472             HeaderLine_DefaultHeader,
2473             HeaderLine_CustomHeader,
2474             NoHeaderLine_DefaultHeader,
2475             NoHeaderLine_CustomHeader,
2476             NoHeaderLine_NoOutputHeader,
2477         }
2478 
2479         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2480         {
2481             return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2482                           op.name, hc, actual, expected);
2483         }
2484 
2485         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
2486                                   const char[] actual, const char[] expected)
2487         {
2488             return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
2489                           op.name, hc, rowIndex, actual, expected);
2490         }
2491 
2492         /* Run the logic for each header use case. */
2493         foreach (hc; EnumMembers!HeaderUsecase)
2494         {
2495             bool hasInputHeader = (
2496                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2497                 hc == HeaderUsecase.HeaderLine_CustomHeader
2498                 );
2499             bool hasOutputHeader = (
2500                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2501                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2502                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2503                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2504                 );
2505             bool hasCustomHeader = (
2506                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2507                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2508                 );
2509 
2510             if (hasCustomHeader) assert(hasOutputHeader);
2511 
2512             auto op = new OperatorClass();
2513 
2514             if (hasCustomHeader)
2515             {
2516                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2517                 op.setCustomHeader(customOutputFieldHeader);
2518             }
2519 
2520             Operator[] operatorArray;
2521             operatorArray ~= op;
2522 
2523             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2524             summarizer.setOperators(inputRangeObject(operatorArray));
2525             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2526 
2527             if (hasOutputHeader)
2528             {
2529                 /* Write the header line. Note that this is a one-field header, */
2530                 auto headerLineOutput = appender!(char[])();
2531                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2532 
2533                 /* Test that the header was generated correctly.
2534                  *
2535                  * Note: Because the output is generated by a Summarizer, it will have a
2536                  * trailing newline. Use chomp to trim it.
2537                  */
2538                 final switch (hc)
2539                 {
2540                 case HeaderUsecase.HeaderLine_DefaultHeader:
2541                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2542                     assert(headerLineOutput.data.chomp == defaultHeader,
2543                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2544                                                defaultHeader));
2545                     break;
2546                 case HeaderUsecase.HeaderLine_CustomHeader:
2547                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2548                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2549                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2550                                                customOutputFieldHeader));
2551                     break;
2552                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2553                     break;
2554                 }
2555 
2556             }
2557 
2558             /* For each line, process the line, generate the output, and test that the
2559              * value is correct. Start with the empty file case.
2560              */
2561             foreach (i, const char[] expected; expectedValues)
2562             {
2563                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2564                 auto summaryLineOutput = appender!(char[])();
2565                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2566                 assert(summaryLineOutput.data.chomp == expected,
2567                        valueAssertMessage(operatorArray[0], hc, i,
2568                                           summaryLineOutput.data.chomp, expectedValues[i]));
2569             }
2570         }
2571     }
2572 }
2573 
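/* Specific operators.
 *
 * Notes:
 * - Calculator inner classes that need nothing from their operator, such as
 *   CountCalculator, are 'static'. This means inner class instances do not keep a
 *   reference to the context of the outer class. In exchange, such Calculator
 *   instances need to hold all needed state themselves.
 * - Single field calculators such as RetainCalculator and MinCalculator are regular
 *   (non-static) inner classes. Their getOperator() returns the enclosing operator
 *   via 'this.outer'. The field index they are summarizing is still passed to the
 *   calculator's constructor.
 */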
2581 
/** CountOperator counts the number of occurrences of each unique key, or the number
 * of input lines if there is no unique key.
 *
 * Unlike most other operators, CountOperator doesn't summarize a specific field on
 * the line. It summarizes a property of the unique key itself, which is why it
 * derives from ZeroFieldOperator rather than SingleFieldOperator.
 */
final class CountOperator : ZeroFieldOperator
{
    this()
    {
        super("count");
    }

    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator();
    }

    /** Counts the entries it is told about. Starts at zero. */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        /// One more entry has been seen.
        final override void processNextEntry()
        {
            _count += 1;
        }

        /// The count so far, formatted by the print options.
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}
2616 
unittest // CountOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* Each file has three rows, so the expected counts are the same for all of them. */
    foreach (inputFile; [oneColumn, twoColumns, threeColumns])
    {
        testZeroFieldOperator!CountOperator(inputFile, "count", ["0", "1", "2", "3"]);
    }
}
2627 
/** RetainOperator retains the first occurrence of a field, without changing the header.
 *
 * RetainOperator is meant for fields whose value is expected to be the same for all
 * occurrences of the unique key, where the goal is to pass the value through
 * unchanged. It is like FirstOperator, except that the original header is
 * preserved. Header preservation is set up by the flags passed to the
 * SingleFieldOperator constructor (no header suffix, no custom header).
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
 */
final class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    /** Remembers the first value it is given. The value is an empty string until then. */
    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Only the first value is kept; later values are ignored. */
            if (_done) return;

            _value = nextField.to!string;
            _done = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2680 
unittest // RetainOperator
{
    alias check = testSingleFieldOperator!RetainOperator;

    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The header suffix is empty: retain keeps the original header. */
    check(oneColumn, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    check(twoColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    check(twoColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    check(threeColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    check(threeColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    check(threeColumns, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    /* Missing value in the first row. */
    auto missingFirst = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(missingFirst, 0, "", ["", "", "r2c1", "r2c1"], excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    check(missingFirst, 0, "", ["", "NA", "NA", "NA"], replaceMissing);
}
2700 
/** FirstOperator outputs the first value found for the field. The value is output
 * as the original text; it is an empty string if no value has been processed.
 */
final class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    /** Remembers the first value it is given. */
    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Only the first value is kept; later values are ignored. */
            if (_done) return;

            _value = nextField.to!string;
            _done = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2745 
unittest // FirstOperator
{
    alias check = testSingleFieldOperator!FirstOperator;

    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The first row's value is kept for the selected column. */
    check(oneColumn, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    check(twoColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    check(twoColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    check(threeColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    check(threeColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    check(threeColumns, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    /* Missing value in the first row. */
    auto missingFirst = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(missingFirst, 0, "first", ["", "", "r2c1", "r2c1"], excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    check(missingFirst, 0, "first", ["", "NA", "NA", "NA"], replaceMissing);
}
2765 
/** LastOperator outputs the last value found for the field. The value is output as
 * the original text; it is an empty string if no value has been processed.
 */
final class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    /** Remembers the most recent value it is given. */
    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /// Each new value overwrites the previous one.
        final override void processNextField(const char[] nextField)
        {
            _value = nextField.to!string;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2805 
unittest // LastOperator
{
    alias check = testSingleFieldOperator!LastOperator;

    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* The value follows the selected column row by row. */
    check(oneColumn, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    check(twoColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    check(twoColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    check(threeColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    check(threeColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    check(threeColumns, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    /* Missing value in the first row. */
    auto missingFirst = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(missingFirst, 0, "last", ["", "", "r2c1", "r3c1"], excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    check(missingFirst, 0, "last", ["", "NA", "r2c1", "r3c1"], replaceMissing);
}
2825 
/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 *
 * Values are compared as doubles, but the result is the original string of the
 * minimum value, without additional numeric formatting. This can be useful when
 * joining back to the original data. It differs from numeric operators that
 * perform calculations. The result is "nan" if no value has been processed.
 */
final class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    /** Tracks the smallest value seen so far, together with its original text. */
    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double candidate = nextField.to!double;

            /* The first value is always taken. After that only a strictly smaller
             * value replaces the current minimum.
             */
            if (_isFirst || candidate < _value)
            {
                _value = candidate;
                _originalString = nextField.to!string;
                _isFirst = false;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
2882 
unittest // MinOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MinOperator(oneColRows, 0, "min", ["nan", "10", "9.5", "9.5"]);

    testSingleFieldOperator!MinOperator(twoColRows, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoColRows, 1, "min", ["nan", "-30", "-30", "-31"]);

    testSingleFieldOperator!MinOperator(threeColRows, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeColRows, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeColRows, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["10"], ["-10"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "5");  // Replace missing

    testSingleFieldOperator!MinOperator(rowsWithMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        excludeMissing);
    testSingleFieldOperator!MinOperator(rowsWithMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        replaceMissing);
}
2902 
/** MaxOperator outputs the maximum value seen for the field. This is a numeric operator.
 *
 * The result is the original text of the largest value, without any additional
 * numeric formatting. Keeping the input text intact can be useful when joining the
 * result back to the original data. This differs from numeric operators that
 * perform calculations and format the computed number.
 */
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /** Tracks the largest value processed so far along with the text it came from. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _hasValue = false;
        private double _largest = double.nan;

        /* Result text when no values have been processed. A literal is used because
         * floats cannot be formatted at compile time (noted for version 2.087).
         */
        private string _largestText = "nan";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double. The field replaces the current maximum if it
         * is the first value seen or compares greater than the current maximum.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);
            if (!_hasValue || candidate > _largest)
            {
                _largest = candidate;
                _largestText = to!string(nextField);
                _hasValue = true;
            }
        }

        /* Returns the text of the maximum value, or "nan" if nothing was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _largestText;
        }
    }
}
2959 
unittest // MaxOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MaxOperator(oneColRows, 0, "max", ["nan", "10", "10", "11"]);

    testSingleFieldOperator!MaxOperator(twoColRows, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoColRows, 1, "max", ["nan", "-30", "-29", "-29"]);

    testSingleFieldOperator!MaxOperator(threeColRows, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeColRows, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeColRows, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["-10"], ["10"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "5");  // Replace missing

    testSingleFieldOperator!MaxOperator(rowsWithMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        excludeMissing);
    testSingleFieldOperator!MaxOperator(rowsWithMissing, 0, "max", ["nan", "5", "5", "10"],
                                        replaceMissing);
}
2979 
/** RangeOperator outputs the difference between the maximum and minimum values.
 *
 * The range is zero when there is a single value or when all values are the same.
 * It is also zero when no values have been processed, as both bounds start at zero.
 * This is a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /** Tracks the lowest and highest values processed so far. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _hasValue = false;
        private double _low = 0.0;
        private double _high = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double and widens the tracked bounds if needed. */
        final override void processNextField(const char[] nextField)
        {
            immutable double current = to!double(nextField);

            if (!_hasValue)
            {
                /* The first value is both the lower and the upper bound. */
                _hasValue = true;
                _low = current;
                _high = current;
                return;
            }

            /* A value cannot extend both bounds, so at most one update is needed. */
            if (current > _high) _high = current;
            else if (current < _low) _low = current;
        }

        /* Returns the formatted distance between the upper and lower bounds. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_high - _low);
        }
    }
}
3037 
unittest // RangeOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!RangeOperator(oneColRows, 0, "range", ["0", "0", "0.5", "1.5"]);

    testSingleFieldOperator!RangeOperator(twoColRows, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(twoColRows, 1, "range", ["0", "0", "1", "2"]);

    testSingleFieldOperator!RangeOperator(threeColRows, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(threeColRows, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(threeColRows, 2, "range", ["0", "0", "4", "16.5"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");      // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "5.5");  // Replace missing

    testSingleFieldOperator!RangeOperator(rowsWithMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          excludeMissing);
    testSingleFieldOperator!RangeOperator(rowsWithMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          replaceMissing);
}
3057 
/** SumOperator outputs the sum of all the values for the field. This is a numeric
 * operator. The sum is zero when no values have been processed.
 */
final class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Accumulates a running total of the field values. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            immutable double addend = to!double(nextField);
            _runningSum += addend;
        }

        /* Returns the formatted total. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_runningSum);
        }
    }
}
3097 
unittest // SumOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(oneColRows, 0, "sum", ["0", "10", "19.5", "30.5"]);

    testSingleFieldOperator!SumOperator(twoColRows, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoColRows, 1, "sum", ["0", "-30", "-59", "-90"]);

    testSingleFieldOperator!SumOperator(threeColRows, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeColRows, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeColRows, 2, "sum", ["0", "-4.5", "-5", "7"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");      // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "1.5");  // Replace missing

    testSingleFieldOperator!SumOperator(rowsWithMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        excludeMissing);
    testSingleFieldOperator!SumOperator(rowsWithMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        replaceMissing);
}
3117 
/** MeanOperator outputs the mean (average) of all the values for the field. This is
 * a numeric operator. The result is NaN when no values have been processed.
 */
final class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Accumulates the total and the number of values processed. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;
        private size_t _numValues = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double, adds it to the total, and counts it. The
         * count is only incremented if the parse succeeds.
         */
        final override void processNextField(const char[] nextField)
        {
            _runningSum += to!double(nextField);
            ++_numValues;
        }

        /* Returns the formatted mean, or formatted NaN if there were no values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_numValues == 0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_runningSum / to!double(_numValues));
        }
    }
}
3160 
unittest // MeanOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(oneColRows, 0, "mean", ["nan", "10", "9.75", "9"]);

    testSingleFieldOperator!MeanOperator(twoColRows, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoColRows, 1, "mean", ["nan", "-30", "-29.5", "-30"]);

    testSingleFieldOperator!MeanOperator(threeColRows, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeColRows, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeColRows, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["6"], [""], ["14"], ["40"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "0");  // Replace missing

    testSingleFieldOperator!MeanOperator(rowsWithMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         excludeMissing);
    testSingleFieldOperator!MeanOperator(rowsWithMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         replaceMissing);
}
3180 
/** MedianOperator outputs the median of all the values for the field. This is a
 * numeric operator.
 *
 * The calculation needs all the field values, which are stored in memory. Storage
 * is handled by the unique key value lists; the operator requests it by calling
 * setSaveFieldValuesNumeric in its constructor.
 */
final class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Holds no state of its own; the median is read from the saved field values. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field. The values are saved by the value lists. */
        final override void processNextField(const char[] nextField)
        { }

        /* Returns the formatted median of the saved numeric values for the field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto medianValue = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(medianValue);
        }
    }
}
3221 
unittest // MedianOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(oneColRows, 0, "median", ["nan", "10", "9.75", "9.5"]);

    testSingleFieldOperator!MedianOperator(twoColRows, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoColRows, 1, "median", ["nan", "-30", "-29.5", "-30"]);

    testSingleFieldOperator!MedianOperator(threeColRows, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeColRows, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeColRows, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "0");  // Replace missing

    testSingleFieldOperator!MedianOperator(rowsWithMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           excludeMissing);
    testSingleFieldOperator!MedianOperator(rowsWithMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           replaceMissing);
}
3241 
/** QuantileOperator outputs the value representing the data at a cumulative
 * probability. This is a numeric operator.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75
 * probabilities (alternately, the 25th, 50th, and 75th percentile ranks, the 50th
 * percentile being the median). Data is sorted in ascending order. An operator
 * instance handles a single probability, but it is common to generate several
 * quantile ranks for the same field when summarizing.
 *
 * The output header is "pct" followed by the probability as a percentage, for
 * example "pct50" for 0.5. A probability of zero uses the header "pct0".
 *
 * The calculation needs all the field's values, which are stored in memory. Storage
 * is handled by the unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /* The probability must be in the range [0.0, 1.0]. This is checked by an assert. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(0.0 <= probability && probability <= 1.0);

        string header =
            (probability == 0.0)
            ? "pct0"
            : format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Holds no state of its own; the quantile is computed from the saved field values. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field. The values are saved by the value lists. */
        final override void processNextField(const char[] nextField)
        { }

        /* Returns the formatted quantile of the sorted numeric values, using the
         * probability held by the enclosing operator.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            return printOptions.formatNumber(
                quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}
3297 
unittest // QuantileOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultPolicy = new MissingFieldPolicy;

    /* Probability 0.5. These are the same as the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(oneColRows, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 0, "pct50", ["nan", "20", "20.5", "21"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 1, "pct50", ["nan", "9", "4.5", "0"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultPolicy, 0.50);

    /* Probability 0.0. This extreme is the minimum. */
    testSingleFieldOperatorBase!QuantileOperator(oneColRows, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 0, "pct0", ["nan", "20", "20", "20"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 0, "pct0", ["nan", "9009", "9", "9"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 1, "pct0", ["nan", "9", "0", "-3"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultPolicy, 0.0);

    /* Probability 1.0. This extreme is the maximum. */
    testSingleFieldOperatorBase!QuantileOperator(oneColRows, 0, "pct100", ["nan", "10", "10", "10"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 0, "pct100", ["nan", "20", "21", "22"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColRows, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 1, "pct100", ["nan", "9", "9", "9"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColRows, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultPolicy, 1.0);

    /* Missing (empty) field handling. These re-use the median tests. */
    auto rowsWithMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "0");  // Replace missing

    testSingleFieldOperatorBase!QuantileOperator(rowsWithMissing, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                                 excludeMissing, 0.5);
    testSingleFieldOperatorBase!QuantileOperator(rowsWithMissing, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
                                                 replaceMissing, 0.5);
}
3336 
/** MadOperator outputs the median absolute deviation from the median. This is a
 * numeric operator.
 *
 * The result is the raw MAD value. No normalization is applied.
 *
 * The calculation needs all the field values, which are stored in memory. Storage
 * is handled by the unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /** Holds no state of its own; the MAD is computed from the saved field values. */
    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field. The values are saved by the value lists. */
        final override void processNextField(const char[] nextField)
        { }

        /* Computes the median of the saved values, then the absolute distance of each
         * value from that median, and returns the formatted median of the distances.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            /* The median is requested before the values, as in the original ordering. */
            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto samples = valuesLists.numericValues(fieldIndex);

            auto deviations = new double[samples.length];
            foreach (i; 0 .. samples.length)
            {
                deviations[i] = abs(samples[i] - center);
            }

            return printOptions.formatNumber(rangeMedian(deviations));
        }
    }
}
3389 
unittest // MadOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoColRows = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeColRows = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(oneColRows, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);

    testSingleFieldOperator!MadOperator(twoColRows, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(twoColRows, 1, "mad", ["nan", "0", "0.5", "1"]);

    testSingleFieldOperator!MadOperator(threeColRows, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 2, "mad", ["nan", "0", "1", "2"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [[""], ["16"], [""], ["32"], ["-4"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "0");  // Replace missing

    testSingleFieldOperator!MadOperator(rowsWithMissing, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
                                        excludeMissing);
    testSingleFieldOperator!MadOperator(rowsWithMissing, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
                                        replaceMissing);
}
3409 
/** VarianceOperator outputs the variance of the field's values. This is a numeric
 * operator.
 *
 * The result divides the sum of squared differences from the mean by one less than
 * the number of values. It is NaN when fewer than two values have been processed.
 */
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /** Maintains a running count, mean, and sum of squared differences. */
    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _numValues = 0.0;
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;   // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Updates the running statistics with the next value. The count is incremented
         * before the field is parsed. The squared-difference update uses the distance
         * from the mean both before and after the mean is adjusted.
         */
        final override void processNextField(const char[] nextField)
        {
            _numValues += 1.0;
            double current = to!double(nextField);
            double distFromOldMean = current - _runningMean;
            _runningMean += distFromOldMean / _numValues;
            _sumSqDiffs += distFromOldMean * (current - _runningMean);
        }

        /* Returns the formatted variance, or formatted NaN with fewer than two values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_numValues >= 2.0)
            {
                return printOptions.formatNumber(_sumSqDiffs / (_numValues - 1.0));
            }

            return printOptions.formatNumber(double.nan);
        }
    }
}
3456 
unittest // VarianceOperator
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["5"], ["10"], ["15"]];
    auto twoColRows = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeColRows = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(oneColRows, 0, "var", ["nan", "nan", "12.5", "25"]);

    testSingleFieldOperator!VarianceOperator(twoColRows, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoColRows, 1, "var", ["nan", "nan", "12.5", "25"]);

    testSingleFieldOperator!VarianceOperator(threeColRows, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 2, "var", ["nan", "nan", "0", "3"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [["5"], ["10"], [""]];
    auto excludeMissing = new MissingFieldPolicy(true, "");     // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "15");  // Replace missing

    testSingleFieldOperator!VarianceOperator(rowsWithMissing, 0, "var", ["nan", "nan", "12.5", "12.5"],
                                             excludeMissing);
    testSingleFieldOperator!VarianceOperator(rowsWithMissing, 0, "var", ["nan", "nan", "12.5", "25"],
                                             replaceMissing);
}
3476 
/** StDevOperator outputs the standard deviation of the field's values. This is a
 * numeric operator.
 *
 * The result is the square root of the sum of squared differences from the mean
 * divided by one less than the number of values. It is NaN when fewer than two
 * values have been processed.
 */
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /** Maintains a running count, mean, and sum of squared differences. */
    final class StDevCalculator : SingleFieldCalculator
    {
        private double _numValues = 0.0;
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;   // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Updates the running statistics with the next value. The count is incremented
         * before the field is parsed. The squared-difference update uses the distance
         * from the mean both before and after the mean is adjusted.
         */
        final override void processNextField(const char[] nextField)
        {
            _numValues += 1.0;
            double current = to!double(nextField);
            double distFromOldMean = current - _runningMean;
            _runningMean += distFromOldMean / _numValues;
            _sumSqDiffs += distFromOldMean * (current - _runningMean);
        }

        /* Returns the formatted standard deviation, or formatted NaN with fewer than
         * two values.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_numValues >= 2.0)
            {
                return printOptions.formatNumber(sqrt(_sumSqDiffs / (_numValues - 1.0)));
            }

            return printOptions.formatNumber(double.nan);
        }
    }
}
3524 
/* StDevOperator unit tests. The expected values are compared as formatted text, so
 * these tests would be improved by a tolerance option.
 */
unittest
{
    /* Input tables with one, two, and three columns. */
    auto oneColRows = [["1"], ["4"], ["7"]];
    auto twoColRows = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeColRows = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    testSingleFieldOperator!StDevOperator(oneColRows, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);

    testSingleFieldOperator!StDevOperator(twoColRows, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(twoColRows, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);

    testSingleFieldOperator!StDevOperator(threeColRows, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* Missing (empty) field handling. */
    auto rowsWithMissing = [["1"], ["4"], [""]];
    auto excludeMissing = new MissingFieldPolicy(true, "");    // Exclude missing
    auto replaceMissing = new MissingFieldPolicy(false, "7");  // Replace missing

    testSingleFieldOperator!StDevOperator(rowsWithMissing, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
                                          excludeMissing);
    testSingleFieldOperator!StDevOperator(rowsWithMissing, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
                                          replaceMissing);
}
3546 
/** UniqueCountOperator outputs the number of unique values for the field. Values are
 * compared by exact text match, not numerically. For example, "1" and "1.0" are
 * counted as different values.
 *
 * All the unique field values are stored in memory as part of this calculation.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    /** Records each distinct field text as a key in an associative array. */
    final class UniqueCountCalculator : SingleFieldCalculator
    {
        private bool[string] _seen;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Adds the field text if it has not been seen. The text is copied to a string
         * only when a new entry is added.
         */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _seen) return;

            _seen[to!string(nextField)] = true;
        }

        /* Returns the formatted number of distinct values recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_seen.length);
        }
    }
}
3589 
unittest // UniqueCount
{
    /* Inputs with one, two, and three columns. Each expected-value list has one
     * more entry than the input has rows.
     */
    auto singleCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto doubleCol = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto tripleCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(
        singleCol, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(
        doubleCol, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        doubleCol, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        tripleCol, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(
        tripleCol, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        tripleCol, 2, "unique_count", ["0", "1", "2", "3"]);

    /* Single column input containing empty fields, run under both missing policies. */
    auto singleColWithEmpties = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!UniqueCountOperator(
        singleColWithEmpties, 0, "unique_count",
        ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "XYZ");
    testSingleFieldOperator!UniqueCountOperator(
        singleColWithEmpties, 0, "unique_count",
        ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        replaceMissing);
}
3611 
/** MissingCountOperator reports how many field values were missing. It overrides
 * the global missing field policy: the policy passed in is kept only to classify
 * fields as missing, while the base class is given a separate
 * MissingFieldPolicy(false, "").
 */
final class MissingCountOperator : SingleFieldOperator
{
    /* The caller's policy, consulted by the calculator to classify each field. */
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the tally when the operator's saved policy says nextField is missing. */
        final override void processNextField(const char[] nextField)
        {
            const fieldIsMissing = this.outer._globalMissingPolicy.isMissingField(nextField);
            if (fieldIsMissing) ++_missingCount;
        }

        /** Returns the missing-value tally, formatted via printOptions. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3655 
unittest // MissingCount
{
    /* Inputs with one, two, and three columns. Note that " " (a single space) in
     * the one column input is not counted as missing by the expectations below.
     */
    auto singleCol = [["a"], ["b"], [""], [" "], [""]];
    auto doubleCol = [["abc", ""], ["", ""], ["def", ""]];
    auto tripleCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing field policy. */
    testSingleFieldOperator!MissingCountOperator(
        singleCol, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(
        doubleCol, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(
        doubleCol, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(
        tripleCol, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(
        tripleCol, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(
        tripleCol, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The same expectations hold under the exclude policy and then the replace policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!MissingCountOperator(
            singleCol, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            doubleCol, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            doubleCol, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(
            tripleCol, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            tripleCol, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            tripleCol, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}
3686 
/** NotMissingCountOperator reports how many field values were not missing. It
 * overrides the global missing field policy: the policy passed in is kept only to
 * classify fields as missing, while the base class is given a separate
 * MissingFieldPolicy(false, "").
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    /* The caller's policy, consulted by the calculator to classify each field. */
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the tally unless the operator's saved policy says nextField is missing. */
        final override void processNextField(const char[] nextField)
        {
            const fieldIsMissing = this.outer._globalMissingPolicy.isMissingField(nextField);
            if (fieldIsMissing) return;
            ++_notMissingCount;
        }

        /** Returns the not-missing tally, formatted via printOptions. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
3730 
unittest // NotMissingCount
{
    /* Inputs with one, two, and three columns. Note that " " (a single space) in
     * the one column input is counted as not missing by the expectations below.
     */
    auto singleCol = [["a"], ["b"], [""], [" "], [""]];
    auto doubleCol = [["abc", ""], ["", ""], ["def", ""]];
    auto tripleCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing field policy. */
    testSingleFieldOperator!NotMissingCountOperator(
        singleCol, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(
        doubleCol, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(
        doubleCol, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(
        tripleCol, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(
        tripleCol, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(
        tripleCol, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The same expectations hold under the exclude policy and then the replace policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!NotMissingCountOperator(
            singleCol, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            doubleCol, 0, "not_missing_count", ["0", "1", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            doubleCol, 1, "not_missing_count", ["0", "0", "0", "0"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            tripleCol, 0, "not_missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            tripleCol, 1, "not_missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            tripleCol, 2, "not_missing_count", ["0", "1", "2", "2"], policy);
    }
}
3761 
/** ModeOperator outputs the value that occurred most often. When several values
 * share the highest count, the one that was seen first wins. If no values have
 * been processed the result is the empty string.
 *
 * Every distinct field value is copied and held in memory for this calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    final class ModeCalculator : SingleFieldCalculator
    {
        /* Occurrence count per distinct value. */
        private size_t[string] _valueCounts;

        /* Distinct values in first-seen order; this ordering decides ties. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /** Counts one more occurrence of nextField, registering it on first sight. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
                return;
            }

            /* New value: take a string copy to use as both the key and the ordered entry. */
            string ownedValue = nextField.to!string;
            _uniqueValues.put(ownedValue);
            _valueCounts[ownedValue] = 1;
        }

        /** Returns the most frequent value. The print options are not consulted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highestCount = 0;
            string mostFrequent = "";

            /* Walk in first-seen order. The strict comparison keeps the earliest
             * value when counts are equal.
             */
            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                auto occurrences = _valueCounts[candidate];
                if (occurrences <= highestCount) continue;

                highestCount = occurrences;
                mostFrequent = candidate;
            }

            return mostFrequent;
        }
    }
}
3833 
unittest // ModeOperator
{
    /* Inputs with one, two, and three columns. Each expected-value list has one
     * more entry than the input has rows.
     */
    auto singleCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto doubleCol = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto tripleCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(
        singleCol, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(
        doubleCol, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(
        doubleCol, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(
        tripleCol, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(
        tripleCol, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(
        tripleCol, 2, "mode", ["", "a", "a", "a"]);

    /* Single column input containing empty fields, run under both missing policies. */
    auto singleColWithEmpties = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeOperator(
        singleColWithEmpties, 0, "mode",
        ["", "", "a", "a", "a", "a", "c", "b", "b"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeOperator(
        singleColWithEmpties, 0, "mode",
        ["", "X", "X", "X", "X", "X", "X", "X", "b"],
        replaceMissing);
}
3855 
/** ModeCountOperator outputs how many times the most frequent value occurred.
 * The result is zero when no values have been processed.
 *
 * Every distinct field value is copied and held in memory for this calculation.
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    final class ModeCountCalculator : SingleFieldCalculator
    {
        /* Occurrence count per distinct value. */
        private size_t[string] _valueCounts;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /** Counts one more occurrence of nextField, registering it on first sight. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
                return;
            }

            /* New value: take a string copy to use as the key. */
            string ownedValue = nextField.to!string;
            _valueCounts[ownedValue] = 1;
        }

        /** Returns the largest occurrence count, formatted via printOptions. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highestCount = 0;

            foreach (occurrences; _valueCounts.byValue)
            {
                if (occurrences <= highestCount) continue;
                highestCount = occurrences;
            }

            return printOptions.formatNumber(highestCount);
        }
    }
}
3910 
unittest // ModeCountOperator
{
    /* Inputs with one, two, and three columns. Each expected-value list has one
     * more entry than the input has rows.
     */
    auto singleCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto doubleCol = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto tripleCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeCountOperator(
        singleCol, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(
        doubleCol, 0, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        doubleCol, 1, "mode_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        tripleCol, 0, "mode_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(
        tripleCol, 1, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        tripleCol, 2, "mode_count", ["0", "1", "1", "1"]);

    /* Single column input containing empty fields, run under both missing policies. */
    auto singleColWithEmpties = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeCountOperator(
        singleColWithEmpties, 0, "mode_count",
        ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeCountOperator(
        singleColWithEmpties, 0, "mode_count",
        ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
        replaceMissing);
}
3932 
/** ValuesOperator outputs every value of the field, joined by the values delimiter
 * taken from the print options.
 *
 * The calculator keeps no state of its own. The constructor asks for field text
 * to be saved, and the result is built from the unique key values lists handed to
 * calculate.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the values are captured by the saved field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Joins the saved text values for this field with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedText = valuesLists.textValues(fieldIndex);
            return savedText.join(printOptions.valuesDelimiter);
        }
    }
}
3974 
unittest // ValuesOperator
{
    /* Inputs with one, two, and three columns. Each expected-value list has one
     * more entry than the input has rows.
     */
    auto singleCol = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto doubleCol = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto tripleCol = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    testSingleFieldOperator!ValuesOperator(
        singleCol, 0, "values",
        ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(
        doubleCol, 0, "values", ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(
        doubleCol, 1, "values", ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(
        tripleCol, 0, "values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(
        tripleCol, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(
        tripleCol, 2, "values", ["", "-", "-|--", "-|--|---"]);

    /* The single column input has empty fields; run it under both missing policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ValuesOperator(
        singleCol, 0, "values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ValuesOperator(
        singleCol, 0, "values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
        replaceMissing);
}
3995 
/** UniqueValuesOperator outputs each distinct value of the field once, joined by
 * the values delimiter taken from the print options. Values appear in the order
 * they were first seen.
 *
 * Every distinct field value is copied and held in memory for this calculation.
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    /** Creates a calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        /* Membership table for values already seen; the stored number is never read. */
        private size_t[string] _valuesHash;

        /* Distinct values in first-seen order; this is what gets output. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /** Registers nextField the first time it is seen; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            const alreadySeen = (nextField in _valuesHash) !is null;
            if (alreadySeen) return;

            /* New value: take a string copy to use as both the key and the ordered entry. */
            string ownedValue = nextField.to!string;
            _uniqueValues.put(ownedValue);
            _valuesHash[ownedValue] = 1;
        }

        /** Joins the distinct values, in first-seen order, with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctValues = _uniqueValues.data;
            return distinctValues.join(printOptions.valuesDelimiter);
        }
    }
}
4047 
unittest // UniqueValuesOperator
{
    /* Inputs with one, two, and three columns. Each expected-value list has one
     * more entry than the input has rows.
     */
    auto singleCol = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto doubleCol = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto tripleCol = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    testSingleFieldOperator!UniqueValuesOperator(
        singleCol, 0, "unique_values",
        ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    testSingleFieldOperator!UniqueValuesOperator(
        doubleCol, 0, "unique_values", ["", "", "", "|xyz"]);
    testSingleFieldOperator!UniqueValuesOperator(
        doubleCol, 1, "unique_values", ["", "50", "50", "50|52"]);
    testSingleFieldOperator!UniqueValuesOperator(
        tripleCol, 0, "unique_values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!UniqueValuesOperator(
        tripleCol, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!UniqueValuesOperator(
        tripleCol, 2, "unique_values", ["", "-", "-|--", "-|--"]);

    /* The single column input has empty fields; run it under both missing policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!UniqueValuesOperator(
        singleCol, 0, "unique_values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!UniqueValuesOperator(
        singleCol, 0, "unique_values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
        replaceMissing);
}