tsv_utils.tsv_summarize source code

1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2020, eBay Inc.
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple;
19 import std.container : DList;
20 
/* D runtime option block. "gcopt=cleanup:none" is passed to the runtime's GC
 * configuration; it is only declared for compiler front-ends 2.085 and later.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // Unit test builds take their main from the -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Parses the command line into a TsvSummarizeOptions, then runs tsvSummarize over
     * the arguments that remain after option processing. Returns the process exit
     * code: the code chosen by processArgs when it asks to stop, 1 when tsvSummarize
     * throws, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when built by DMD in code coverage mode. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        const argsResult = cmdopt.processArgs(cmdArgs);
        immutable bool shouldContinue = argsResult[0];
        immutable int earlyExitCode = argsResult[1];
        if (!shouldContinue) return earlyExitCode;

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            tsvSummarize(cmdopt, cmdArgs[1..$]);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }

        return 0;
    }
}
55 
/** Extended help text, printed ahead of the option list for '--help-verbose'.
 *
 * The text must agree with the options registered in TsvSummarizeOptions.processArgs:
 * the grouping option is '--group-by' (short form '-g'), the header option is
 * '--header' (short form '-H'), and the standard deviation operator is '--stdev'.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make are generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min time_mean
   ford   122      127
   chevy  124      124
   bmw    118      122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for the full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25                // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
133 
/** Short help text, printed ahead of the option list for '--help'. */
string helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";
143 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 *
 * Summarization operators are instantiated while options are parsed (in the option
 * handler callbacks) and stored in `operators` in command line order.
 */
struct TsvSummarizeOptions {
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.

    /* Derived value. Handed to every field operator at construction and updated in
     * place by derivations() once all options are known.
     * NOTE(review): a class-typed default initializer is evaluated once, so every
     * default-initialized TsvSummarizeOptions refers to the same policy object.
     * Presumably only one options object is live per run - confirm before creating
     * several.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Errors found while processing are reported on stderr here; they are not
     * propagated to the caller as exceptions.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &countOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, like '--count', but use STR as the header.", &countHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &operatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &operatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value is '<field-list>' or '<field>:<header>'. One operator is created
     * per field and appended to `operators`; endFieldIndex is raised to cover each field.
     * Throws an Exception on a malformed value, when a custom header is combined with
     * more than one field, or when the operator does not allow custom headers.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list ('--max :hdr') and a trailing colon ('--max 3:'). */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        try
        {
            foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (!valSplit[2].empty) // Header specified
                {
                    if (fieldNum > 1)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                                   option, optionVal));
                    }
                    else if (!op.allowCustomHeader)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                                   option, optionVal));
                    }

                    op.setCustomHeader(valSplit[2].to!string);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            /* Prefix the message with the option name and rethrow the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>]' or '<field>:<prob>:<header>'.
     * One QuantileOperator is created for every (field, probability) pair. Throws an
     * Exception on a malformed value, a probability outside [0.0,1.0], or a custom
     * header combined with multiple fields or multiple probabilities.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. An option
         * value without any colon fails here, as the probability part is empty.
         */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try
        {
            foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
            {
                fieldIndices ~= fieldIndex;
            }
        }
        catch (Exception exc)
        {
            /* Prefix the message with the option name and rethrow the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated conjunction so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator with STR as its header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            /* Option names match those registered in processArgs. */
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}
435 
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * A Summarizer is selected by the number of key fields (none, one, or several) and
 * given the operators from the command line. Each input file ("-" or no files means
 * standard input) is read line by line; the leading cmdopt.endFieldIndex fields of
 * every line are handed to the Summarizer. When all input is consumed the summary
 * header (if requested) and body are written to stdout.
 *
 * Throws an Exception when a line has too few fields or when the Summarizer fails to
 * process a line.
 */
void tsvSummarize(TsvSummarizeOptions cmdopt, const string[] inputFiles)
{
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    alias StdoutWriter = typeof(stdout.lockingTextWriter());

    /* The number of key fields decides which Summarizer implementation is used. */
    auto summarizer =
        (cmdopt.keyFields.length == 0)
        ? new NoKeySummarizer!StdoutWriter(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (cmdopt.keyFields.length == 1)
        ? new OneKeySummarizer!StdoutWriter(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!StdoutWriter(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the command line operators over to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Reusable buffer holding the leading fields of the current line. */
    auto fieldBuffer = new char[][](cmdopt.endFieldIndex);
    bool sawHeader = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool readingStdin = (filename == "-");
        auto inputStream = readingStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            immutable bool onFirstLine = (lineNum == 1);
            if (onFirstLine) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            /* Fill the buffer with as many fields as the operators and keys need.
             * Nothing is needed when endFieldIndex is zero, e.g. '--count' used alone,
             * which simply counts input lines.
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t numFilled = 0;
                foreach (fieldValue;
                         line.splitter(cmdopt.inputFieldDelimiter).take(cmdopt.endFieldIndex))
                {
                    fieldBuffer[numFilled] = fieldValue;
                    numFilled++;
                }

                if (numFilled == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Work-around for splitter yielding no elements on an empty line.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * Seen with e.g. '$ tsv-summarize -g 1 --count' on single column
                     * input: an empty line is an empty string for field 1. The line
                     * itself is that empty string, so use it as the first field.
                     */
                    fieldBuffer[0] = line;
                    numFilled = 1;
                }

                if (numFilled < cmdopt.endFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               readingStdin ? "Standard Input" : filename, lineNum));
                }
            }

            if (cmdopt.hasHeader && onFirstLine)
            {
                /* Only the header of the first file is passed on; later ones are dropped. */
                if (!sawHeader)
                {
                    summarizer.processHeaderLine(fieldBuffer);
                    sawHeader = true;
                }
                continue;
            }

            /* Data line. The Summarizer throws when a field cannot be converted to
             * the type an operator expects; add file and line context to the error.
             */
            try
            {
                summarizer.processNextLine(fieldBuffer);
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, readingStdin ? "Standard Input" : filename, lineNum,
                           onFirstLine ? "\n  Is this a header line? Use --header to skip." : ""));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print the results. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    immutable bool wantHeader = cmdopt.hasHeader || cmdopt.writeHeader;
    if (wantHeader) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}
546 
/** Builds the default header for a field that has no header in the input.
 *
 * The result is "fieldN", where N is the one-based field number, i.e. the zero-based
 * fieldIndex plus one.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t oneBasedNumber = fieldIndex + 1;
    return "field" ~ oneBasedNumber.to!string;
}

unittest
{
    assert("field1" == fieldHeaderFromIndex(0));
    assert("field11" == fieldHeaderFromIndex(10));
}
562 
/** Derives the header of a summary column from the header of the summarized field.
 *
 * The two parts are joined with an underscore: field header "length" and operation
 * "max" give "length_max". The field header usually comes from a header line in the
 * input data or from fieldHeaderFromIndex().
 *
 * An empty operationName returns fieldHeader as is. This supports the Retain operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0) return fieldHeader;
    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    assert("originalfield_mycalc" == summaryHeaderFromFieldHeader("originalfield", "mycalc"));
    assert("originalfield" == summaryHeaderFromFieldHeader("originalfield", ""));
}
583 
/** Printing options shared by Summarizers and Calculators. The values normally come
 * from command line options; they are kept in their own struct for modularity.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    /** Formats an integral or floating point number by forwarding it, together with
     *  floatPrecision, to formatNumber from tsv_utils.common.numerics.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;
        return numericsFormatNumber!T(n, floatPrecision);
    }
}
602 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * Implementing classes provide these methods:
 *  - setOperators - Receives the operators to be processed, after construction.
 *  - processHeaderLine - Receives the header line of each file. Returns true if it
 *    was the first header line processed (used when reading multiple files).
 *  - processNextLine - Receives each non-header line.
 *  - writeSummaryHeader - Writes the header line of the summary.
 *  - writeSummaryBody - Writes the result lines of the summary.
 */
interface Summarizer(OutputRange)
{
    /** Receives the operators to be processed. Called after initializing the object. */
    void setOperators(InputRange!Operator op);

    /** Receives the header line of each file. Returns true if it was the first
     *  header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Receives each non-header line. */
    void processNextLine(const char[][] lineFields);

    /** Writes the summary header line to outputStream. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);

    /** Writes the summary result lines to outputStream. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}
634 
/** SummarizerBase performs the work shared by all summarizers, which is most
 * everything except the handling of unique keys.
 *
 * The base class stores the Operators, creates the SharedFieldValues object when an
 * operator asks for field values to be saved, and forwards the header line to the
 * operators. Processing of data lines and writing of the summary are left abstract
 * for derived classes.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Stays null while no operator saves field values.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /** The field delimiter passed to the constructor. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction.
     *
     * Every operator is appended to the operator list and counted. The numeric and
     * text field indices an operator wants saved are registered with the
     * SharedFieldValues object, which is created on first need.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            _numOperators++;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register for operators that save no field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (index; numericIndices) _sharedFieldValues.addNumericIndex(index);
            foreach (index; textIndices) _sharedFieldValues.addTextIndex(index);
        }
    }

    /** Called with the header line of each file. Only the first call passes the
     *  header on to the operators and returns true; later calls do nothing and
     *  return false (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators[]) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a UniqueKeyValuesLists from the shared field values, or returns null
     *  when no operator asked for field values to be saved.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}
712 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * It keeps a single Calculator per Operator and a single (possibly null) values list,
 * and writes exactly one result line.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed.
     *
     *  The base class stores the operators; this override then creates one
     *  Calculator per newly stored operator and the values list.
     */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operator. Build the calculators from the operators
         * stored by the base class rather than iterating 'operators' a second time:
         * an InputRange is single-pass in general and may already be exhausted by the
         * base class traversal. Operators that already have a calculator (from an
         * earlier call) are skipped.
         */
        foreach (op; _operators[].drop(_calculators.length)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. Feeds the line to every calculator and,
     *  when present, to the values list.
     */
    override void processNextLine(const char[][] lineFields)
    {
        foreach (calculator; _calculators) calculator.processNextLine(lineFields);
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the output delimiter. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result line: one calculated value per calculator.
     *  The values list passed to the calculators is null when no operator saves field values.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
762 
/** KeySummarizerBase holds the logic common to the single-key and multi-key summarizers.
 *
 * Those two differ mainly in how the key is formed. They are kept as separate classes
 * to simplify (speed-up) handling of single field keys, which are the most common use
 * case.
 *
 * Unique keys are remembered in the order they are first seen; that order is the order
 * of the result lines.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /** Per-key state: one Calculator per operator plus the (possibly null) values lists. */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;               // Keys in first-seen order.
    private UniqueKeyData[string] _uniqueKeyData;   // Per-key state, looked up by key.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Processes a data line for the given key, registering the key on first sight. */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData entry;
        if (auto existing = key in _uniqueKeyData)
        {
            entry = *existing;
        }
        else
        {
            /* The key is converted to a string only when it has to be stored. */
            entry = addUniqueKey(key.to!string);
        }

        foreach (calculator; entry.calculators) calculator.processNextLine(lineFields);
        if (entry.valuesLists !is null) entry.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Registers a key not seen before and returns its freshly created per-key state. */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        /* One calculator per operator, in operator order. */
        auto perKeyCalculators = new Calculator[_numOperators];
        foreach (slot, op; _operators[].enumerate)
        {
            perKeyCalculators[slot] = op.makeCalculator;
        }

        auto entry = UniqueKeyData(perKeyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = entry;
        return entry;
    }

    /** Writes the header line: the key field header followed by the operator headers. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);

        auto operatorHeaders = _operators[].map!(op => op.header).join(printOptions.fieldDelimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order the keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (key; _uniqueKeys[])
        {
            auto entry = _uniqueKeyData[key];

            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);

            auto resultFields =
                entry.calculators[]
                .map!(calculator => calculator.calculate(entry.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter);
            put(outputStream, resultFields);
            put(outputStream, '\n');
        }
    }

    /** Header text for the key field(s); supplied by the derived class. */
    abstract string keyFieldHeader() const @property;
}
838 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key for a line is the text of the field at the key field index.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    /** Stores the key field index. The key field header starts out as the name
     *  produced by fieldHeaderFromIndex and is replaced if a header line is processed.
     */
    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line, the key field header is
     *  taken from the header text of the key field.
     *
     *  Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* The key field is indexed below, so its index must be strictly less than
         * the number of fields (same condition as in processNextLine).
         */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the key field's text as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
877 
/** This Summarizer is for the case where the unique key is based on multiple fields.
 *
 * The key for a line is the text of the key fields, in key field order, joined by the
 * input field delimiter.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;

    /** Stores a private copy of the key field indices. The key field header starts out
     *  as the fieldHeaderFromIndex names joined by the input field delimiter and is
     *  replaced if a header line is processed.
     */
    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line, the key field header is
     *  built from the header text of the key fields.
     *
     *  Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = joinKeyFields(lineFields);
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the joined key field text as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        string key = joinKeyFields(lineFields);
        processNextLineWithKey(key, lineFields);
    }

    /* Joins the key fields of a line, in key field order, with the input field delimiter. */
    private string joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
    }
}
922 
version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and the expected output. The
     * input file and expected output are already split into lines and fields; the helper
     * manages re-assembly. The program name from the command line args (cmdArgs[0]) is
     * printed if an error occurs, which is useful to identify the test that failed.
     *
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of file
     * input/output would enable running unit tests directly on top of tsvSummarize.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        alias OutputBuffer = typeof(appender!(char[])());

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds an assertion message prefixed with the test name (cmdArgs[0]). */
        auto failureMessage(T...)(string msg, T formatArgs)
        {
            return format("[testSummarizer] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* The args text is captured before option processing, presumably because
         * processArgs may alter cmdArgs - TODO confirm.
         */
        auto originalArgsText = cmdArgs.to!string;

        TsvSummarizeOptions cmdopt;
        auto parseResult = cmdopt.processArgs(cmdArgs);
        assert(parseResult[0], failureMessage("Invalid command lines arg: '%s'.", originalArgsText));

        immutable fieldCount = cmdopt.endFieldIndex;

        assert(file.all!(line => line.length >= fieldCount),
               failureMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        SummarizerBase!OutputBuffer summarizer;
        if (cmdopt.keyFields.length == 0)
        {
            summarizer = new NoKeySummarizer!OutputBuffer(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else if (cmdopt.keyFields.length == 1)
        {
            summarizer = new OneKeySummarizer!OutputBuffer(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else
        {
            summarizer = new MultiKeySummarizer!OutputBuffer(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](fieldCount);
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0 .. fieldCount]) lineFields[i] = val.dup;

            /* Only the first line can be a header line, and only when headers are enabled. */
            if (cmdopt.hasHeader && lineNum == 1)
            {
                summarizer.processHeaderLine(lineFields);
                continue;
            }

            try
            {
                summarizer.processNextLine(lineFields);
            }
            catch (Exception exc)
            {
                assert(false, failureMessage(exc.msg));
            }
        }

        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }
        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output: fields joined by the field delimiter, lines
         * joined by newlines, plus a final newline if the text is non-empty.
         */
        auto fieldSeparator = cmdopt.inputFieldDelimiter.to!string;
        string expectedOutput = expected.map!(fields => fields.join(fieldSeparator)).join("\n");
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               failureMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}
1018 
1019 unittest
1020 {
1021     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1022      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1023      * have separate tests, those tests test the no-key summarizer. The Values operator is
1024      * used in these tests. It engages a number of behaviors, and the results have limited
1025      * ambiguity. Using only one operator limits dependence on individual operators.
1026      */
1027 
1028     auto file1 = [["fld1", "fld2", "fld3"],
1029                   ["a", "a",  "3"],
1030                   ["c", "a",  "2b"],
1031                   ["c", "bc", ""],
1032                   ["a", "c",  "2b"],
1033                   ["",  "bc", ""],
1034                   ["c", "bc", "3"]];
1035 
1036     /* Single-key summarizer tests.
1037      */
1038     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"],
1039                    file1,
1040                    [["fld1", "fld1_values"],
1041                     ["a", "a|a"],
1042                     ["c", "c|c|c"],
1043                     ["",  ""]]
1044         );
1045     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"],
1046                    file1,
1047                    [["fld1", "fld2_values"],
1048                     ["a", "a|c"],
1049                     ["c", "a|bc|bc"],
1050                     ["",  "bc"]]
1051         );
1052     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"],
1053                    file1,
1054                    [["fld1", "fld3_values"],
1055                     ["a", "3|2b"],
1056                     ["c", "2b||3"],
1057                     ["",  ""]]
1058         );
1059     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"],
1060                    file1,
1061                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1062                     ["a", "a|a",   "a|c",     "3|2b"],
1063                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1064                     ["",  "",      "bc",      ""]]
1065         );
1066     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"],
1067                    file1,
1068                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1069                     ["a", "a|a",   "a|c",     "3|2b"],
1070                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1071                     ["",  "",      "bc",      ""]]
1072         );
1073     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"],
1074                    file1,
1075                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1076                     ["a", "3|2b",  "a|c",     "a|a"],
1077                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1078                     ["",  "",      "bc",      ""]]
1079         );
1080     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"],
1081                    file1,
1082                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1083                     ["a", "3|2b",  "a|c",     "a|a"],
1084                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1085                     ["",  "",      "bc",      ""]]
1086         );
1087     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"],
1088                    file1,
1089                    [["fld2", "fld1_values"],
1090                     ["a",  "a|c"],
1091                     ["bc", "c||c"],
1092                     ["c",  "a"]]
1093         );
1094     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"],
1095                    file1,
1096                    [["fld2", "fld2_values"],
1097                     ["a",  "a|a"],
1098                     ["bc", "bc|bc|bc"],
1099                     ["c",  "c"]]
1100         );
1101     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"],
1102                    file1,
1103                    [["fld2", "fld3_values"],
1104                     ["a",  "3|2b"],
1105                     ["bc", "||3"],
1106                     ["c",  "2b"]]
1107         );
1108     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"],
1109                    file1,
1110                    [["fld2", "fld1_values", "fld3_values"],
1111                     ["a",  "a|c",  "3|2b"],
1112                     ["bc", "c||c", "||3"],
1113                     ["c",  "a",    "2b"]]
1114         );
1115     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"],
1116                    file1,
1117                    [["fld2", "fld3_values", "fld1_values"],
1118                     ["a",  "3|2b", "a|c"],
1119                     ["bc", "||3",  "c||c"],
1120                     ["c",  "2b",   "a"]]
1121         );
1122     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"],
1123                    file1,
1124                    [["fld3", "fld1_values"],
1125                     ["3",  "a|c"],
1126                     ["2b", "c|a"],
1127                     ["",   "c|"]]
1128         );
1129     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"],
1130                    file1,
1131                    [["fld3", "fld2_values"],
1132                     ["3",  "a|bc"],
1133                     ["2b", "a|c"],
1134                     ["",   "bc|bc"]]
1135         );
1136     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"],
1137                    file1,
1138                    [["fld3", "fld1_values", "fld2_values"],
1139                     ["3",  "a|c", "a|bc"],
1140                     ["2b", "c|a", "a|c"],
1141                     ["",   "c|",  "bc|bc"]]
1142         );
1143 
1144     /* Multi-key summarizer tests.
1145      */
1146     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"],
1147                    file1,
1148                    [["fld1", "fld2", "fld1_values"],
1149                     ["a", "a",  "a"],
1150                     ["c", "a",  "c"],
1151                     ["c", "bc", "c|c"],
1152                     ["a", "c",  "a"],
1153                     ["", "bc",  ""]]
1154         );
1155     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"],
1156                    file1,
1157                    [["fld1", "fld2", "fld2_values"],
1158                     ["a", "a",  "a"],
1159                     ["c", "a",  "a"],
1160                     ["c", "bc", "bc|bc"],
1161                     ["a", "c",  "c"],
1162                     ["", "bc",  "bc"]]
1163         );
1164     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"],
1165                    file1,
1166                    [["fld1", "fld2", "fld3_values"],
1167                     ["a", "a",  "3"],
1168                     ["c", "a",  "2b"],
1169                     ["c", "bc", "|3"],
1170                     ["a", "c",  "2b"],
1171                     ["", "bc",  ""]]
1172         );
1173     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"],
1174                    file1,
1175                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1176                     ["a", "a",  "3", "a"],
1177                     ["c", "a",  "2b", "c"],
1178                     ["c", "bc", "|3", "c|c"],
1179                     ["a", "c",  "2b", "a"],
1180                     ["",  "bc", "",   ""]]
1181         );
1182     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"],
1183                    file1,
1184                    [["fld3", "fld2", "fld1_values"],
1185                     ["3",  "a",  "a"],
1186                     ["2b", "a",  "c"],
1187                     ["",   "bc", "c|"],
1188                     ["2b", "c",  "a"],
1189                     ["3",  "bc", "c"]]
1190         );
1191     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"],
1192                    file1,
1193                    [["fld3", "fld2", "fld1_values"],
1194                     ["3",  "a",  "a"],
1195                     ["2b", "a",  "c"],
1196                     ["",   "bc", "c|"],
1197                     ["2b", "c",  "a"],
1198                     ["3",  "bc", "c"]]
1199         );
1200     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"],
1201                    file1,
1202                    [["fld2", "fld1", "fld3", "fld2_values"],
1203                     ["a",  "a", "3",  "a"],
1204                     ["a",  "c", "2b", "a"],
1205                     ["bc", "c", "",   "bc"],
1206                     ["c",  "a", "2b", "c"],
1207                     ["bc", "",  "",   "bc"],
1208                     ["bc", "c", "3",  "bc"]]
1209         );
1210 
1211     /* Missing policies. */
1212     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"],
1213                    file1,
1214                    [["fld1", "fld1_values"],
1215                     ["a", "a|a"],
1216                     ["c", "c|c|c"],
1217                     ["",  ""]]
1218         );
1219     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"],
1220                    file1,
1221                    [["fld1", "fld2_values"],
1222                     ["a", "a|c"],
1223                     ["c", "a|bc|bc"],
1224                     ["",  "bc"]]
1225         );
1226     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"],
1227                    file1,
1228                    [["fld1", "fld3_values"],
1229                     ["a", "3|2b"],
1230                     ["c", "2b|3"],
1231                     ["",  ""]]
1232         );
1233     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"],
1234                    file1,
1235                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1236                     ["a", "a|a",   "a|c",     "3|2b"],
1237                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1238                     ["",  "",      "bc",      ""]]
1239         );
1240     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"],
1241                    file1,
1242                    [["fld1", "fld1_values"],
1243                     ["a", "a|a"],
1244                     ["c", "c|c|c"],
1245                     ["",  "NA"]]
1246         );
1247     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"],
1248                    file1,
1249                    [["fld1", "fld2_values"],
1250                     ["a", "a|c"],
1251                     ["c", "a|bc|bc"],
1252                     ["",  "bc"]]
1253         );
1254     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"],
1255                    file1,
1256                    [["fld1", "fld3_values"],
1257                     ["a", "3|2b"],
1258                     ["c", "2b|NA|3"],
1259                     ["",  "NA"]]
1260         );
1261     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"],
1262                    file1,
1263                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1264                     ["a", "a|a",   "a|c",     "3|2b"],
1265                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1266                     ["",  "NA",      "bc",      "NA"]]
1267         );
1268     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"],
1269                    file1,
1270                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1271                     ["a", "a",  "3", "a"],
1272                     ["c", "a",  "2b", "c"],
1273                     ["c", "bc", "3", "c|c"],
1274                     ["a", "c",  "2b", "a"],
1275                     ["",  "bc", "",   ""]]
1276         );
1277     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"],
1278                    file1,
1279                    [["fld3", "fld2", "fld1_values"],
1280                     ["3",  "a",  "a"],
1281                     ["2b", "a",  "c"],
1282                     ["",   "bc", "c"],
1283                     ["2b", "c",  "a"],
1284                     ["3",  "bc", "c"]]
1285         );
1286     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"],
1287                    file1,
1288                    [["fld2", "fld1", "fld3", "fld2_values"],
1289                     ["a",  "a", "3",  "a"],
1290                     ["a",  "c", "2b", "a"],
1291                     ["bc", "c", "",   "bc"],
1292                     ["c",  "a", "2b", "c"],
1293                     ["bc", "",  "",   "bc"],
1294                     ["bc", "c", "3",  "bc"]]
1295         );
1296     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"],
1297                    file1,
1298                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1299                     ["a", "a",  "3", "a"],
1300                     ["c", "a",  "2b", "c"],
1301                     ["c", "bc", "NA|3", "c|c"],
1302                     ["a", "c",  "2b", "a"],
1303                     ["",  "bc", "NA",   "NA"]]
1304         );
1305     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"],
1306                    file1,
1307                    [["fld3", "fld2", "fld1_values"],
1308                     ["3",  "a",  "a"],
1309                     ["2b", "a",  "c"],
1310                     ["",   "bc", "c|NA"],
1311                     ["2b", "c",  "a"],
1312                     ["3",  "bc", "c"]]
1313         );
1314     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"],
1315                    file1,
1316                    [["fld2", "fld1", "fld3", "fld2_values"],
1317                     ["a",  "a", "3",  "a"],
1318                     ["a",  "c", "2b", "a"],
1319                     ["bc", "c", "",   "bc"],
1320                     ["c",  "a", "2b", "c"],
1321                     ["bc", "",  "",   "bc"],
1322                     ["bc", "c", "3",  "bc"]]
1323         );
1324 
1325     /* Validate that the no-key summarizer works with testSummarizer helper function.
1326      */
1327     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"],
1328                    file1,
1329                    [["fld1_values", "fld2_values"],
1330                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1331         );
1332 
1333     /* Header variations: no header line; auto-generated header line; custom headers.
1334      */
1335     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"],
1336                    file1[1..$],
1337                    [["a", "a|a"],
1338                     ["c", "c|c|c"],
1339                     ["",  ""]]
1340         );
1341     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"],
1342                    file1[1..$],
1343                    [["a", "a",  "a"],
1344                     ["c", "a",  "a"],
1345                     ["c", "bc", "bc|bc"],
1346                     ["a", "c",  "c"],
1347                     ["", "bc",  "bc"]]
1348         );
1349     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"],
1350                    file1[1..$],
1351                    [["field2", "field1_values"],
1352                     ["a",  "a|c"],
1353                     ["bc", "c||c"],
1354                     ["c",  "a"]]
1355         );
1356     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"],
1357                    file1[1..$],
1358                    [["field3", "field2", "field1_values"],
1359                     ["3",  "a",  "a"],
1360                     ["2b", "a",  "c"],
1361                     ["",   "bc", "c|"],
1362                     ["2b", "c",  "a"],
1363                     ["3",  "bc", "c"]]
1364         );
1365     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"],
1366                    file1,
1367                    [["fld2", "Field3Values"],
1368                     ["a",  "3|2b"],
1369                     ["bc", "||3"],
1370                     ["c",  "2b"]]
1371         );
1372     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"],
1373                    file1,
1374                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1375                     ["a", "a",  "3", "a"],
1376                     ["c", "a",  "2b", "c"],
1377                     ["c", "bc", "|3", "c|c"],
1378                     ["a", "c",  "2b", "a"],
1379                     ["",  "bc", "",   ""]]
1380         );
1381     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"],
1382                    file1[1..$],
1383                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1384                     ["a", "3|2b",  "a|c",     "a|a"],
1385                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1386                     ["",  "",      "bc",      ""]]
1387         );
1388     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1389                    file1[1..$],
1390                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1391                     ["a", "3",  "a",  "3",  "a", "a"],
1392                     ["c", "2b", "a",  "2b", "c", "a"],
1393                     ["c", "",   "bc", "",   "c", "bc"],
1394                     ["a", "2b", "c",  "2b", "a", "c"],
1395                     ["",  "",   "bc", "",   "",  "bc"],
1396                     ["c", "3",  "bc", "3",  "c", "bc"]]
1397         );
1398     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1399                    file1[1..$],
1400                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1401                     ["a", "3",  "a",  "3",  "a", "a"],
1402                     ["c", "2b", "a",  "2b", "c", "a"],
1403                     ["c", "",   "bc", "",   "c", "bc"],
1404                     ["a", "2b", "c",  "2b", "a", "c"],
1405                     ["",  "",   "bc", "",   "",  "bc"],
1406                     ["c", "3",  "bc", "3",  "c", "bc"]]
1407         );
1408 
1409     /* Alternate file widths and lengths.
1410      */
1411 
1412     auto file3x2 = [["fld1", "fld2", "fld3"],
1413                     ["a", "b", "c"],
1414                     ["c", "b", "a"]];
1415 
1416     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"],
1417                    file3x2,
1418                    [["fld1", "fld3_values"],
1419                     ["a", "c"],
1420                     ["c", "a"]]
1421         );
1422     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"],
1423                    file3x2,
1424                    [["fld2", "fld3_values"],
1425                     ["b", "c|a"]]
1426         );
1427     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"],
1428                    file3x2,
1429                    [["fld2", "fld1", "fld3_values"],
1430                     ["b", "a", "c"],
1431                     ["b", "c", "a"]]
1432         );
1433 
1434     auto file3x1 = [["fld1", "fld2", "fld3"],
1435                     ["a", "b", "c"]];
1436 
1437     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"],
1438                    file3x1,
1439                    [["fld1", "fld3_values"],
1440                     ["a", "c"]]
1441         );
1442     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"],
1443                    file3x1[1..$],
1444                    [["a", "c"]]
1445         );
1446     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"],
1447                    file3x1,
1448                    [["fld2", "fld1", "fld3_values"],
1449                     ["b", "a", "c"]]
1450         );
1451     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"],
1452                    file3x1[1..$],
1453                    [["b", "a", "c"]]
1454         );
1455 
1456     auto file3x0 = [["fld1", "fld2", "fld3"]];
1457 
1458     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"],
1459                    file3x0,
1460                    [["fld1", "fld3_values"]]
1461         );
1462     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"],
1463                    file3x0[1..$],
1464                    []
1465         );
1466     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"],
1467                    file3x0[1..$],
1468                    [["field1", "field3_values"]]
1469         );
1470 
1471 
1472     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"],
1473                    file3x0,
1474                    [["fld2", "fld1", "fld3_values"]]
1475         );
1476 
1477     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"],
1478                    file3x0[1..$],
1479                    []
1480         );
1481 
1482     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"],
1483                    file3x0[1..$],
1484                    [["field2", "field1", "field3_values"]]
1485         );
1486 
1487     auto file2x1 = [["fld1", "fld2"],
1488                     ["a", "b"]];
1489 
1490     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"],
1491                    file2x1,
1492                    [["fld1", "fld2_values"],
1493                     ["a", "b"]]
1494         );
1495     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"],
1496                    file2x1,
1497                    [["fld2", "fld1", "fld1_values"],
1498                     ["b", "a", "a"]]
1499         );
1500 
1501     auto file2x0 = [["fld1", "fld2"]];
1502 
1503     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"],
1504                    file2x0,
1505                    [["fld1", "fld2_values"]]
1506         );
1507     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"],
1508                    file2x0,
1509                    [["fld2", "fld1", "fld1_values"]]
1510         );
1511 
1512     auto file1x2 = [["fld1"],
1513                     ["a"],
1514                     [""]];
1515 
1516     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"],
1517                    file1x2,
1518                    [["fld1", "fld1_values"],
1519                     ["a", "a"],
1520                     ["",  ""]]
1521         );
1522 
1523     auto file1x2b = [["fld1"],
1524                      [""],
1525                      [""]];
1526 
1527     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"],
1528                    file1x2b,
1529                    [["fld1", "fld1_values"],
1530                     ["", "|"]]
1531         );
1532 
1533     auto file1x1 = [["fld1"],
1534                     ["x"]];
1535 
1536     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"],
1537                    file1x1,
1538                    [["fld1", "fld1_values"],
1539                     ["x", "x"]]
1540         );
1541 
1542     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"],
1543                    file1x1[1..$],
1544                    [["x", "x"]]
1545         );
1546 
1547     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"],
1548                    file1x1[1..$],
1549                    [["field1", "field1_values"],
1550                     ["x", "x"]]
1551         );
1552 
1553     auto file1x1b = [["fld1"],
1554                     [""]];
1555 
1556     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"],
1557                    file1x1b,
1558                    [["fld1", "fld1_values"],
1559                     ["", ""]]
1560         );
1561 
1562     auto file1x0 = [["fld1"]];
1563 
1564     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"],
1565                    file1x0,
1566                    [["fld1", "fld1_values"]]
1567         );
1568 
1569     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"],
1570                    file1x0[1..$],
1571                    []
1572         );
1573 
1574     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"],
1575                    file1x0[1..$],
1576                    [["field1", "field1_values"]]
1577         );
1578 
1579     /* Alternate delimiters. */
1580     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"],
1581                    file1,
1582                    [["fld1_values", "fld2_values"],
1583                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1584         );
1585     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"],
1586                    file1,
1587                    [["fld1_values", "fld2_values"],
1588                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1589         );
1590     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","],
1591                    file1,
1592                    [["fld1_values", "fld2_values"],
1593                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1594         );
1595     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1596                     "--delimiter", "^", "--values-delimiter", ":"],
1597                    file1[1..$],
1598                    [["field2", "field1_values"],
1599                     ["a",  "a:c"],
1600                     ["bc", "c::c"],
1601                     ["c",  "a"]]
1602         );
1603     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1604                     "--values-delimiter", "\\"],
1605                    file1[1..$],
1606                    [["a", "a",  "a"],
1607                     ["c", "a",  "a"],
1608                     ["c", "bc", "bc\\bc"],
1609                     ["a", "c",  "c"],
1610                     ["", "bc",  "bc"]]
1611         );
1612 }
1613 
1614 /* Summary Operators and Calculators
1615  *
1616  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1617  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1618  * Calculator is used to manage the summary calculation for each unique key in the input.
1619  *
1620  * As an example, consider the command:
1621  *
1622  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1623  *
1624  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1625  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1626  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1627  * calculator needs to track occurrence count and sum. Calculators produce the final
1628  * value when all processing is finished.
1629  *
1630  * Summary field headers
1631  *
1632  * There are several options for specifying summary field headers. The defaults combine the
1633  * operator name and the header of the field summarized. The defaults can be overridden on
1634  * the command line. These scenarios are supported via the operator constructor and the
1635  * processHeaderLine() method.
1636  *
1637  * Missing field policy
1638  *
1639  * At present, tsv-summarize has a single policy for handling missing values that applies
1640  * to all operators. However, it is logically operator specific and is implemented that
1641  * way. The MissingFieldPolicy class describes the policy; each operator contains one.
1642  * Calculators access their operator's policy object.
1643  */
1644 
/** An Operator represents a summary calculation specified on the command line,
 *  e.g. '--mean 5'. It supplies the output header for its summary field, lists the
 *  fields it needs saved, and creates Calculator objects.
 */
interface Operator
{
    /** Header used for this operator's summary field. */
    string header() @property;

    /** Name of the operator. */
    string name() @property;

    /** Receives the fields of the input header line. */
    void processHeaderLine(const char[][] fields);

    /** Numeric fields this Operator needs saved. */
    size_t[] numericFieldsToSave();

    /** Text fields this Operator needs saved. */
    size_t[] textFieldsToSave();

    /** Creates a Calculator for this Operator. */
    Calculator makeCalculator();
}
1657 
/** A Calculator is responsible for a single computation. It is given each input
 *  line and produces the final value once all processing is finished.
 */
interface Calculator
{
    /** Consumes the fields of the next input line. */
    void processNextLine(const char[][] fields);

    /** Produces the final value as a string. The saved field values are available
     *  through valuesLists.
     */
    string calculate(UniqueKeyValuesLists valuesLists, ref const SummarizerPrintOptions printOptions);
}
1666 
/** Describes how a missing (empty) field value is to be processed.
 *
 *  Exactly one of three modes is in effect:
 *    - use:     missing values are processed unchanged (the default).
 *    - replace: missing values are replaced by a replacement string. Selected
 *               whenever the replacement string is non-empty.
 *    - exclude: selected when exclusion is requested and no replacement is given.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are processed unchanged.
    private bool _replaceMissing = false;     // Missing values are replaced.
    private string _missingReplacement;       // Replacement text, meaningful in replace mode.

    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets the policy. A non-empty replacement string takes precedence over the
     * exclusion flag.
     */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        const bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* A field is missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    @property final bool useMissing() const
    {
        return _useMissing;
    }

    /* True when neither the 'use' nor the 'replace' mode is active. */
    @property final bool excludeMissing() const
    {
        return !(_useMissing || _replaceMissing);
    }

    @property final bool replaceMissing() const
    {
        return _replaceMissing;
    }

    @property final string missingReplacement() const
    {
        return _missingReplacement;
    }
}
1712 
1713 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
1714  * while reading data. Operations like median collect all values and operate on them when
1715  * running the final calculation. Value lists are needed for each unique key. A command
1716  * using multiple Operators may save multiple fields. And, different Operators may be run
1717  * against the same field.
1718  *
1719  * The last part motivates these classes. Handling large data sets necessitates minimizing
1720  * in-memory storage, making it desirable to share identical lists between Calculators.
1721  * Otherwise, each Calculator could implement its own storage, which would be simpler.
1722  *
1723  * The setup works as follows:
1724  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
1725  *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
1726  *    of the fields advertised by Operators as needing sharing. This list gets created
1727  *    during command initialization (SummarizerBase.setOperators).
1728  *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
1729  *    time a new unique key is found, in parallel to the Calculator objects created for the
1730  *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
1731  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
1732  *    Calculators, saving the values.
1733  *  - Calculators retrieve the saved values during the calculation phase. The calculator's
1734  *    processNextField method is typically a no-op.
1735  *  - Calculators cannot make assumptions about the order of the saved values. This is
1736  *    a pragmatic concession to median and quantile calculations, which need to sort the data,
1737  *    at least partially. Rather than generate sorted copies, the current algorithms
1738  *    sort the data in place.
1739  *
1740  * One concession to duplicate storage is that text and numeric versions of the same
1741  * field might be stored. The reason is because it's important to convert text to numbers
1742  * as they are read so that useful error messages can be generated. And, storing both
1743  * forms of the same field should be less common.
1744  *
1745  * The current implementation uses the same missing values policy for all fields. If
1746  * multiple policies become supported this will need to change.
1747  *
1748  * Built-in calculations - UniqueKeyValuesLists objects have a built-in median operation. This is
1749  * to avoid repeated calculations of the median by different calculations.
1750  */
1751 
/* Records which field indices need their values saved and creates the per-key
 * UniqueKeyValuesLists objects that hold those values.
 */
final class SharedFieldValues
{
    // Field indices that need to be saved, in the order they were first added.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to register a field index whose numeric values
     * need to be saved. e.g. '--median 7' ends up calling addNumericIndex(6), 6 being
     * the zero-based index. A specific index is only added once.
     */
    final void addNumericIndex (size_t index)
    {
        if (_numericFieldIndices.canFind(index)) return;
        _numericFieldIndices ~= index;
    }

    /* Same as addNumericIndex, but registers a text index. */
    final void addTextIndex (size_t index)
    {
        if (_textFieldIndices.canFind(index)) return;
        _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program if
     * no keys are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto valuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return valuesLists;
    }
}
1781 
/* Holds the saved field values for one unique key.
 *
 * One value list is kept per saved field index, separately for the numeric (double)
 * and text (string) forms. The accessors return the stored array itself rather than a
 * copy. Sorting happens in place, and the median calculation may reorder the array as
 * well, so callers cannot rely on the order of values they obtained earlier.
 */
final class UniqueKeyValuesLists
{
    /* A FieldValues object holds the list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn results in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;    // Not referenced within this class.

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes an input line to every value list: numeric lists first, then text lists. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (fieldValues; _numericFieldValues) fieldValues.processNextLine(fields, missingPolicy);
        foreach (fieldValues; _textFieldValues) fieldValues.processNextLine(fields, missingPolicy);
    }

    /* Linear search for the numeric value list of a field index. The index must have
     * been passed to the constructor.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Linear search for the text value list of a field index. The index must have
     * been passed to the constructor.
     */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* The values saved for one field, plus cached state (sorted flag, median) that is
     * invalidated whenever a value is added.
     */
    private final class FieldValues(ValueType)
    {
        // The Appender type is what is used below; import it explicitly rather than
        // relying on a file-level import to make it visible.
        import std.array : Appender;

        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this field's value from the input line, applying the missing value
         * policy: a missing value is saved unchanged, replaced, or dropped.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                appendValue(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                appendValue(missingPolicy.missingReplacement.to!ValueType);
            }
        }

        /* Adds a value and invalidates the cached median and sorted state. */
        private void appendValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Sorts the stored values in place (once, until new values arrive or the
         * order is disturbed) and returns them.
         */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* Returns the median, computing it on first use and caching the result. */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                // The median calculation works on the stored array in place and may
                // leave it only partially ordered. Drop the sorted flag so that a later
                // getSortedArray() call re-sorts instead of trusting a stale flag.
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}
1938 
/** SingleFieldOperator is a base class for operators that work on a single field, the
 * most common kind of Operator. Derived classes implement makeCalculator and the
 * Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        // Default header. May be overridden by a custom header or by the header line.
        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Sets a custom header. Once set, processHeaderLine no longer changes the header.
     * Only valid for operators that allow custom headers.
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    @property final string name() const
    {
        return _name;
    }

    @property final string header() const
    {
        return _header;
    }

    @property final size_t fieldIndex() const
    {
        return _fieldIndex;
    }

    @property final bool useHeaderSuffix() const
    {
        return _useHeaderSuffix;
    }

    @property final bool allowCustomHeader() const
    {
        return _allowCustomHeader;
    }

    @property final MissingFieldPolicy missingPolicy()
    {
        return _missingPolicy;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of this operator's field should be saved. These should be called
     * during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    /* Derives the summary header from the input header line, unless a custom header
     * has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);
        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    abstract SingleFieldCalculator makeCalculator();
}
2042 
/** SingleFieldCalculator is a base class for the common case of a calculator that uses
 * a single field. Derived classes implement processNextField() rather than
 * processNextLine(); the missing value policy of the calculator's operator is applied
 * before processNextField() is called.
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    @property final size_t fieldIndex() const
    {
        return _fieldIndex;
    }

    /* Picks this calculator's field from the line and hands it to processNextField.
     * A missing value is passed on unchanged, replaced, or dropped, as directed by
     * the operator's missing value policy.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] value = fields[_fieldIndex];
        const bool isMissing = policy.isMissingField(value);

        if (!isMissing || policy.useMissing)
        {
            processNextField(value);
        }
        else if (policy.replaceMissing)
        {
            processNextField(policy.missingReplacement);
        }
    }

    /* The operator this calculator belongs to. */
    abstract SingleFieldOperator getOperator();

    /* Receives the field value for each line, after the missing value policy is applied. */
    abstract void processNextField(const char[] field);
}
2081 
2082 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2083 version(unittest)
2084 {
2085     /** A helper for SingleFieldOperator unit tests.
2086      *
2087      * testSingleFieldOperator takes a set of split file values, a field index, a header
2088      * suffix, and a set of expected values. The expected values array contains the
2089      * initial value (zero entries) and the expected values after each line. (One more
2090      * expected value than input lines.) The zero entry case is what is generated for an
2091      * empty file. An example testing the 'min' operator against a file with 2 columns,
2092      * 3 rows, using field index 1:
2093      *
2094      *    testSingleFieldOperator!MinOperator(
2095      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2096      *        ["5", "50"],
2097      *        ["20", "200"]],
2098      *       1,                            // Field index (zero-based, so "100", "50", "200")
2099      *       "min",                        // The header suffix, normally the operator name.
2100      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2101      *
2102      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2103      * Then run the operator is tested against each column, a total of six calls. Headers
2104      * are automatically checked. Additional entries can be used to extend coverage.
2105      *
2106      * A non-default MissingFieldPolicy can be provide as an optional last argument.
2107      * Operator tests should include exclusion and replacement variations. See operator
2108      * unit tests for details.
2109      *
2110      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2111      * init arguments. Currently this is used only by the quantile operator.
2112      *
2113      * These tests do not check unique key behavior (group-by). Operators don't have info
2114      * about unique keys, and interact with them only indirectly, via Calculators.
2115      */
2116     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2117         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2118          const char[][] expectedValues,
2119          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2120     {
2121         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2122     }
2123 
2124     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2125         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2126          const char[][] expectedValues,
2127          MissingFieldPolicy missingPolicy,
2128          T extraOpInitArgs)
2129     {
2130         import std.format : format;
2131         import std.array : appender;
2132         import std.string : chomp;
2133         import std.traits : EnumMembers;
2134 
2135         auto numFields = (splitFile[0]).length;
2136 
2137         assert(fieldIndex < numFields,
2138                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2139                       headerSuffix));
2140         assert(splitFile.length + 1 == expectedValues.length,
2141                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2142                       headerSuffix));
2143 
2144         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2145         auto printOptions = SummarizerPrintOptions('#', '|');
2146 
2147         /* An input header line. */
2148         string[] inputHeaderLine = new string[numFields];
2149         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2150 
2151         /* The different expected output field headers. */
2152         auto outputFieldHeaderWithNoHeaderLine =
2153             fieldHeaderFromIndex(fieldIndex)
2154             .summaryHeaderFromFieldHeader(headerSuffix);
2155         auto outputFieldHeaderFromHeaderLine =
2156             inputHeaderLine[fieldIndex]
2157             .summaryHeaderFromFieldHeader(headerSuffix);
2158         auto customOutputFieldHeader = "custom";
2159 
2160         enum HeaderUsecase {
2161             HeaderLine_DefaultHeader,
2162             HeaderLine_CustomHeader,
2163             NoHeaderLine_DefaultHeader,
2164             NoHeaderLine_CustomHeader,
2165             NoHeaderLine_NoOutputHeader,
2166         }
2167 
2168         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2169         {
2170             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2171                           op.name, hc, actual, expected);
2172         }
2173 
2174         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2175                                   const char[] actual, const char[] expected)
2176         {
2177             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2178                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2179         }
2180 
2181         /* Run the logic for each header use case. */
2182         foreach (hc; EnumMembers!HeaderUsecase)
2183         {
2184             bool hasInputHeader = (
2185                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2186                 hc == HeaderUsecase.HeaderLine_CustomHeader
2187                 );
2188             bool hasOutputHeader = (
2189                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2190                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2191                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2192                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2193                 );
2194             bool hasCustomHeader = (
2195                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2196                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2197                 );
2198 
2199             if (hasCustomHeader) assert(hasOutputHeader);
2200 
2201             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2202 
2203             if (hasCustomHeader)
2204             {
2205                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2206                 op.setCustomHeader(customOutputFieldHeader);
2207             }
2208 
2209             Operator[] operatorArray;
2210             operatorArray ~= op;
2211 
2212             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2213             summarizer.setOperators(inputRangeObject(operatorArray));
2214 
2215             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2216 
2217             if (hasOutputHeader)
2218             {
2219                 /* Write the header line. Note that this is a one-field header, */
2220                 auto headerLineOutput = appender!(char[])();
2221                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2222 
2223                 /* Test that the header was generated correctly.
2224                  *
2225                  * Note: Because the output is generated by a Summarizer, it will have a
2226                  * trailing newline. Use chomp to trim it.
2227                  */
2228                 final switch (hc)
2229                 {
2230                 case HeaderUsecase.HeaderLine_DefaultHeader:
2231                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2232                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2233                                                outputFieldHeaderFromHeaderLine));
2234                     break;
2235                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2236                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2237                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2238                                                outputFieldHeaderWithNoHeaderLine));
2239                     break;
2240                 case HeaderUsecase.HeaderLine_CustomHeader:
2241                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2242                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2243                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2244                                                customOutputFieldHeader));
2245                     break;
2246                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2247                     break;
2248                }
2249 
2250             }
2251 
2252             /* For each line, process the line, generate the output, and test that the
2253              * value is correct. Start with the empty file case.
2254              */
2255             foreach (i, const char[] expected; expectedValues)
2256             {
2257                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2258                 auto summaryLineOutput = appender!(char[])();
2259                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2260                 assert(summaryLineOutput.data.chomp == expected,
2261                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2262                                           summaryLineOutput.data.chomp, expectedValues[i]));
2263             }
2264         }
2265     }
2266 }
2267 
2268 /** ZeroFieldOperator is a base class for operators that take no input. The main use
2269  * case is the CountOperator, which counts the occurrences of each unique key. Other
2270  * uses are possible, for example, weighted random number assignment.
2271  *
2272  * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify
2273  * the information available to such a routine. In particular, the split fields passed
2274  * to processHeaderLine and processNextLine don't include all fields in the input,
2275  * something that might not be obvious when implementing an operator. (Only fields
2276  * required by operators acting on specific fields are included.)
2277  */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;      // Operator name, fixed at construction.
    private string _header;    // Output header. Starts out equal to the operator name.

    /* The operator name doubles as the default output header. */
    this(string operatorName)
    {
        _header = _name = operatorName;
    }

    /* Replaces the current output header with the caller supplied one. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* This base implementation always permits a custom header. */
    bool allowCustomHeader() const @property { return true; }

    final string name() const @property { return _name; }

    final string header() const @property { return _header; }

    /* Intentionally empty: the header line is not examined. */
    final void processHeaderLine(const char[][] fields) { }

    /* No numeric fields are requested; the result is always an empty array. */
    final size_t[] numericFieldsToSave() { return null; }

    /* No text fields are requested; the result is always an empty array. */
    final size_t[] textFieldsToSave() { return null; }

    abstract ZeroFieldCalculator makeCalculator();
}
2330 
2331 /** ZeroFieldCalculator is a base class for operators that don't use fields as input.
2332  * In particular, the Count operator. It is a companion to the ZeroFieldOperator class.
2333  *
2334  * Derived classes implement processNextEntry() rather than processNextLine(), and the
2335  * single-argument form of calculate(). Both are declared abstract in this class.
2336  */
class ZeroFieldCalculator : Calculator
{
    this() { }

    /* The two hooks derived classes provide. */
    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);

    /* The field contents are not examined. Each call is forwarded to
     * processNextEntry().
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        this.processNextEntry();
    }

    /* The values lists are not examined. The call is forwarded to the
     * single-argument calculate().
     */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return this.calculate(printOptions);
    }
}
2355 
version(unittest)
{
    /* A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the expected value for
     * the empty file case followed by the expected values after each line.
     *
     * Params:
     *   splitFile      = Input rows, each already split into fields. May be empty, in
     *                    which case only the empty file case is checked.
     *   defaultHeader  = The header expected in the output when no custom header is set.
     *   expectedValues = Expected summary values. Entry 0 is checked before any row is
     *                    processed, entry i after row (i - 1) has been processed. Must
     *                    contain exactly one more entry than splitFile.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* Validate the arguments before touching splitFile[0], so that bad input is
         * reported by this message rather than by an array bounds error.
         */
        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. defaultHeader: %s",
                      defaultHeader));

        /* The first row defines the field count. An empty file has zero fields. */
        auto numFields = (splitFile.length > 0) ? splitFile[0].length : 0;

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            /* A fresh operator and summarizer are created for every use case. */
            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case (i == 0), where no line
             * has been processed yet.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i,
                                          summaryLineOutput.data.chomp, expected));
            }
        }
    }
}
2496 
2497 /* Specific operators.
2498  *
2499  * Notes:
2500  * - The 'Calculator' inner classes are 'static'. This means inner class instances do not
2501  *   keep a reference to the context of the outer class. In exchange, Calculator instances
2502  *   need to hold all needed state, typically the field index they are summarizing.
2503  */
2504 
2505 /** CountOperator counts the number of occurrences of each unique key, or the number of
2506  * input lines if there is no unique key.
2507  *
2508  * CountOperator differs from most other operators in that it doesn't summarize a specific
2509  * field on the line. Instead it is summarizing a property of the unique key itself. For
2510  * this reason it doesn't derive from SingleFieldOperator.
2511  */
final class CountOperator : ZeroFieldOperator
{
    /* Registers the operator under the name "count". */
    this()
    {
        super("count");
    }

    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator;
    }

    /* Keeps a running tally of the entries seen. */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count;    // Default-initialized to zero.

        final override void processNextEntry()
        {
            _count += 1;
        }

        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}
2539 
unittest // CountOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The same running counts are expected for each three-row input, regardless of
     * the number of columns. The leading "0" is the empty file case.
     */
    auto runningCounts = ["0", "1", "2", "3"];

    foreach (inputFile; [oneColumn, twoColumns, threeColumns])
    {
        testZeroFieldOperator!CountOperator(inputFile, "count", runningCounts);
    }
}
2550 
2551 /** RetainOperator retains the first occurrence of a field, without changing the header.
2552  *
2553  * RetainOperator is intended for fields where the value is expected to be the same for
2554  * all occurrences of the unique key, and the goal is to pass the value through unchanged.
2555  * It is like FirstOperator, except that the original header is preserved. The original
2556  * header preservation is set up in the call to the SingleFieldOperator constructor.
2557  *
2558  * Notes:
2559  * - An option to signal an error if multiple values are encountered might be useful.
2560  */
final class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        /* Header suffix and custom header are both switched off for this operator. */
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    /* Captures the first field value passed in and ignores all later ones. */
    final class RetainCalculator : SingleFieldCalculator
    {
        private string _value = "";
        private bool _done = false;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (_done) return;    // A value has already been captured.

            _value = to!string(nextField);
            _done = true;
        }

        /* Returns the captured value, or the empty string if none was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2603 
unittest // RetainOperator
{
    auto inputFiles = [
        [["r1c1"], ["r2c1"], ["r3c1"]],
        [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]],
        [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]],
    ];

    /* Expected values by field index: the empty file case first, then the value
     * after each of the three rows.
     */
    auto expectedByField = [
        ["", "r1c1", "r1c1", "r1c1"],
        ["", "r1c2", "r1c2", "r1c2"],
        ["", "r1c3", "r1c3", "r1c3"],
    ];

    /* Every field of every input file, in file order then field order. */
    foreach (inputFile; inputFiles)
    {
        foreach (fieldIndex; 0 .. inputFile[0].length)
        {
            testSingleFieldOperator!RetainOperator(inputFile, fieldIndex, "", expectedByField[fieldIndex]);
        }
    }

    /* Missing value handling: the first row's field is empty. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!RetainOperator(firstValueMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "NA");  // Replace missing
    testSingleFieldOperator!RetainOperator(firstValueMissing, 0, "", ["", "NA", "NA", "NA"],
                                           replacePolicy);
}
2623 
2624 /** FirstOperator outputs the first value found for the field.
2625  */
final class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    /* Captures the first field value passed in and ignores all later ones. */
    final class FirstCalculator : SingleFieldCalculator
    {
        private string _value = "";
        private bool _done = false;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (_done) return;    // A value has already been captured.

            _value = to!string(nextField);
            _done = true;
        }

        /* Returns the captured value, or the empty string if none was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2668 
unittest // FirstOperator
{
    auto inputFiles = [
        [["r1c1"], ["r2c1"], ["r3c1"]],
        [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]],
        [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]],
    ];

    /* Expected values by field index: the empty file case first, then the value
     * after each of the three rows.
     */
    auto expectedByField = [
        ["", "r1c1", "r1c1", "r1c1"],
        ["", "r1c2", "r1c2", "r1c2"],
        ["", "r1c3", "r1c3", "r1c3"],
    ];

    /* Every field of every input file, in file order then field order. */
    foreach (inputFile; inputFiles)
    {
        foreach (fieldIndex; 0 .. inputFile[0].length)
        {
            testSingleFieldOperator!FirstOperator(inputFile, fieldIndex, "first", expectedByField[fieldIndex]);
        }
    }

    /* Missing value handling: the first row's field is empty. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!FirstOperator(firstValueMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "NA");  // Replace missing
    testSingleFieldOperator!FirstOperator(firstValueMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          replacePolicy);
}
2688 
2689 /** LastOperator outputs the last value found for the field.
2690  */
final class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    /* Overwrites the stored value on every call, so the most recent one wins. */
    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _value = to!string(nextField);
        }

        /* Returns the stored value, or the empty string if none was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2728 
unittest // LastOperator
{
    auto inputFiles = [
        [["r1c1"], ["r2c1"], ["r3c1"]],
        [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]],
        [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]],
    ];

    /* Expected values by field index: the empty file case first, then the value
     * after each of the three rows.
     */
    auto expectedByField = [
        ["", "r1c1", "r2c1", "r3c1"],
        ["", "r1c2", "r2c2", "r3c2"],
        ["", "r1c3", "r2c3", "r3c3"],
    ];

    /* Every field of every input file, in file order then field order. */
    foreach (inputFile; inputFiles)
    {
        foreach (fieldIndex; 0 .. inputFile[0].length)
        {
            testSingleFieldOperator!LastOperator(inputFile, fieldIndex, "last", expectedByField[fieldIndex]);
        }
    }

    /* Missing value handling: the first row's field is empty. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!LastOperator(firstValueMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "NA");  // Replace missing
    testSingleFieldOperator!LastOperator(firstValueMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         replacePolicy);
}
2748 
2749 /** MinOperator outputs the minimum value for the field. This is a numeric operator.
2750  *
2751  * This operator returns the original string without additional numeric formatting.
2752  * This can be useful when joining back to the original data. This is different than
2753  * numeric operators that perform calculations.
2754  */
final class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    /* Tracks the smallest value seen along with the exact text it was parsed from. */
    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            /* The first value is always taken. Later values only if strictly smaller. */
            if (_isFirst || fieldValue < _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /* Returns the input text of the minimum, or "nan" if no value was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
2805 
unittest // MinOperator
{
    auto inputFiles = [
        [["10"], ["9.5"], ["11"]],
        [["20", "-30"], ["21", "-29"], ["22", "-31"]],
        [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]],
    ];

    /* Expected values indexed by [file][field]: the empty file case first, then the
     * value after each of the three rows.
     */
    auto expectedByFileAndField = [
        [["nan", "10", "9.5", "9.5"]],
        [["nan", "20", "20", "20"], ["nan", "-30", "-30", "-31"]],
        [["nan", "9009", "199", "199"], ["nan", "9", "0", "0"], ["nan", "-4.5", "-4.5", "-4.5"]],
    ];

    foreach (fileIndex, inputFile; inputFiles)
    {
        foreach (fieldIndex, expected; expectedByFileAndField[fileIndex])
        {
            testSingleFieldOperator!MinOperator(inputFile, fieldIndex, "min", expected);
        }
    }

    /* Missing value handling: the first row's field is empty. */
    auto firstValueMissing = [[""], ["10"], ["-10"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!MinOperator(firstValueMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "5");  // Replace missing
    testSingleFieldOperator!MinOperator(firstValueMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        replacePolicy);
}
2825 
2826 /** MaxOperator outputs the maximum value for the field. This is a numeric operator.
2827  *
2828  * This operator returns the original string without additional numeric formatting.
2829  * This can be useful when joining back to the original data. This is different than
2830  * numeric operators that perform calculations.
2831  */
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /* Tracks the largest value seen along with the exact text it was parsed from. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            /* The first value is always taken. Later values only if strictly larger. */
            if (_isFirst || fieldValue > _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /* Returns the input text of the maximum, or "nan" if no value was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
2882 
unittest // MaxOperator
{
    auto inputFiles = [
        [["10"], ["9.5"], ["11"]],
        [["20", "-30"], ["21", "-29"], ["22", "-31"]],
        [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]],
    ];

    /* Expected values indexed by [file][field]: the empty file case first, then the
     * value after each of the three rows.
     */
    auto expectedByFileAndField = [
        [["nan", "10", "10", "11"]],
        [["nan", "20", "21", "22"], ["nan", "-30", "-29", "-29"]],
        [["nan", "9009", "9009", "9009"], ["nan", "9", "9", "9"], ["nan", "-4.5", "-0.5", "12"]],
    ];

    foreach (fileIndex, inputFile; inputFiles)
    {
        foreach (fieldIndex, expected; expectedByFileAndField[fileIndex])
        {
            testSingleFieldOperator!MaxOperator(inputFile, fieldIndex, "max", expected);
        }
    }

    /* Missing value handling: the first row's field is empty. */
    auto firstValueMissing = [[""], ["-10"], ["10"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!MaxOperator(firstValueMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "5");  // Replace missing
    testSingleFieldOperator!MaxOperator(firstValueMissing, 0, "max", ["nan", "5", "5", "10"],
                                        replacePolicy);
}
2902 
2903 /** RangeOperator outputs the difference between the minimum and maximum values.
2904  *
2905  * If there is a single value, or all values are the same, the range is zero. This is
2906  * a numeric operator.
2907  */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /* Tracks the smallest and largest values seen and reports their difference. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            /* The first value seeds both bounds. */
            if (_isFirst)
            {
                _isFirst = false;
                _minValue = fieldValue;
                _maxValue = fieldValue;
                return;
            }

            /* Only one bound is adjusted per value: the upper bound is checked first,
             * the lower bound only if the upper bound did not change.
             */
            if (fieldValue > _maxValue) _maxValue = fieldValue;
            else if (fieldValue < _minValue) _minValue = fieldValue;
        }

        /* Formats (max - min). Both bounds start at 0.0, so the difference is zero
         * when no value was processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
2960 
unittest // RangeOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The range is "0" both before any value and after exactly one value. */
    testSingleFieldOperator!RangeOperator(oneColRows, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(twoColRows, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(twoColRows, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(threeColRows, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(threeColRows, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(threeColRows, 2, "range", ["0", "0", "4", "16.5"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["10"], [""], ["9.5"], ["11"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "5.5");

    testSingleFieldOperator!RangeOperator(
        rowsWithBlanks, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], excludeMissing);
    testSingleFieldOperator!RangeOperator(
        rowsWithBlanks, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], replaceMissing);
}
2980 
/** SumOperator adds up all the values of a field. This is a numeric operator.
 *
 * The total starts at zero, so zero is reported when no values have been processed.
 */
final class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /// Creates a calculator with its own running total.
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    final class SumCalculator : SingleFieldCalculator
    {
        private double _sum = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /// Converts the field to a double and adds it to the running total.
        final override void processNextField(const char[] nextField)
        {
            _sum += to!double(nextField);
        }

        /// Formats the accumulated total.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_sum);
        }
    }
}
3020 
unittest // SumOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["9.5"], ["11"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* Expected values are the running totals, starting from "0" with no rows. */
    testSingleFieldOperator!SumOperator(oneColRows, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(twoColRows, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoColRows, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(threeColRows, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeColRows, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeColRows, 2, "sum", ["0", "-4.5", "-5", "7"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["10"], [""], ["9.5"], ["11"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "1.5");

    testSingleFieldOperator!SumOperator(
        rowsWithBlanks, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], excludeMissing);
    testSingleFieldOperator!SumOperator(
        rowsWithBlanks, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], replaceMissing);
}
3040 
/** MeanOperator computes the arithmetic mean (average) of a field's values. This is
 * a numeric operator.
 *
 * The result is NaN when no values have been processed.
 */
final class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /// Creates a calculator with its own running total and value count.
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    final class MeanCalculator : SingleFieldCalculator
    {
        private size_t _valueCount = 0;
        private double _sum = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /// Converts the field to a double, adds it to the total, and counts it.
        final override void processNextField(const char[] nextField)
        {
            _sum += to!double(nextField);
            ++_valueCount;
        }

        /// Formats total / count, or NaN when nothing has been counted.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double mean = double.nan;
            if (_valueCount > 0) mean = _sum / _valueCount.to!double;
            return printOptions.formatNumber(mean);
        }
    }
}
3083 
unittest // MeanOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The mean is "nan" until at least one value has been processed. */
    testSingleFieldOperator!MeanOperator(oneColRows, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(twoColRows, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoColRows, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(threeColRows, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeColRows, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeColRows, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["6"], [""], ["14"], ["40"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "0");

    testSingleFieldOperator!MeanOperator(
        rowsWithBlanks, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], excludeMissing);
    testSingleFieldOperator!MeanOperator(
        rowsWithBlanks, 0, "mean", ["nan", "0", "3", "2", "5", "12"], replaceMissing);
}
3103 
/** MedianOperator computes the median of a field's values. This is a numeric operator.
 *
 * The calculation needs every field value, so the values are kept in memory. The
 * storage is managed by the unique key value lists rather than by the calculator.
 */
final class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);

        /* Ask for the field's numeric values to be saved; calculate() reads them back. */
        setSaveFieldValuesNumeric();
    }

    /// Creates a calculator. It holds no state of its own.
    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /// Nothing to accumulate here; the saved field values carry all the data.
        final override void processNextField(const char[] nextField)
        {
        }

        /// Formats the median of the values saved for this calculator's field.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }
}
3144 
unittest // MedianOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    /* The median is "nan" until at least one value has been processed. */
    testSingleFieldOperator!MedianOperator(oneColRows, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(twoColRows, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoColRows, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(threeColRows, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeColRows, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeColRows, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "0");

    testSingleFieldOperator!MedianOperator(
        rowsWithBlanks, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], excludeMissing);
    testSingleFieldOperator!MedianOperator(
        rowsWithBlanks, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], replaceMissing);
}
3164 
/** QuantileOperator produces the value found at a given cumulative probability of the
 * data. This is a numeric operation.
 *
 * For example, quantiles might be generated for the probabilities 0.25, 0.5, and 0.75
 * (equivalently the 25th, 50th, and 75th percentile ranks, the 50th percentile being
 * the median). Data is sorted in ascending order. Each operator instance handles one
 * probability; it is common to use several instances on the same field when
 * summarizing.
 *
 * All the field's values are kept in memory for this calculation. The storage is
 * managed by the unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /** The probability must lie in [0.0, 1.0]; this is checked with an assert only.
     * The output header is "pct" followed by the probability expressed as a percentage.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(probability >= 0.0 && probability <= 1.0);

        /* "%02g" pads to a minimum width of two with zeros. A probability of exactly
         * zero is special-cased so the header is "pct0".
         */
        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /// Creates a calculator. It holds no state of its own.
    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /// Nothing to accumulate here; the saved field values carry all the data.
        final override void processNextField(const char[] nextField)
        {
        }

        /// Formats the quantile of the field's sorted values at the operator's probability.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            return printOptions.formatNumber(
                quantile(
                    this.outer._prob,
                    valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}
3220 
unittest // QuantileOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["9.5"], ["7.5"]];
    auto twoColRows = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColRows = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto standardPolicy = new MissingFieldPolicy;

    /* Probability 0.5: expectations are the same as in the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(
        oneColRows, 0, "pct50", ["nan", "10", "9.75", "9.5"], standardPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 0, "pct50", ["nan", "20", "20.5", "21"], standardPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 1, "pct50", ["nan", "-30", "-29.5", "-30"], standardPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 0, "pct50", ["nan", "9009", "4509", "4509"], standardPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 1, "pct50", ["nan", "9", "4.5", "0"], standardPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], standardPolicy, 0.50);

    /* Probability 0.0 is the lower extreme: the minimum. */
    testSingleFieldOperatorBase!QuantileOperator(
        oneColRows, 0, "pct0", ["nan", "10", "9.5", "7.5"], standardPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 0, "pct0", ["nan", "20", "20", "20"], standardPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 1, "pct0", ["nan", "-30", "-30", "-31"], standardPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 0, "pct0", ["nan", "9009", "9", "9"], standardPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 1, "pct0", ["nan", "9", "0", "-3"], standardPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], standardPolicy, 0.0);

    /* Probability 1.0 is the upper extreme: the maximum. */
    testSingleFieldOperatorBase!QuantileOperator(
        oneColRows, 0, "pct100", ["nan", "10", "10", "10"], standardPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 0, "pct100", ["nan", "20", "21", "22"], standardPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(
        twoColRows, 1, "pct100", ["nan", "-30", "-29", "-29"], standardPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 0, "pct100", ["nan", "9009", "9009", "9009"], standardPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 1, "pct100", ["nan", "9", "9", "9"], standardPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(
        threeColRows, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], standardPolicy, 1.0);

    /* Missing-field policies: the median test data and expectations are re-used. */
    auto rowsWithBlanks = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "0");

    testSingleFieldOperatorBase!QuantileOperator(
        rowsWithBlanks, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], excludeMissing, 0.5);
    testSingleFieldOperatorBase!QuantileOperator(
        rowsWithBlanks, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], replaceMissing, 0.5);
}
3259 
/** MadOperator computes the median absolute deviation from the median (MAD). This is
 * a numeric operation.
 *
 * The raw MAD value is reported; no normalization constant is applied.
 *
 * The calculation needs every field value, so the values are kept in memory. The
 * storage is managed by the unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);

        /* Ask for the field's numeric values to be saved; calculate() reads them back. */
        setSaveFieldValuesNumeric();
    }

    /// Creates a calculator. It holds no state of its own.
    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /// Nothing to accumulate here; the saved field values carry all the data.
        final override void processNextField(const char[] nextField)
        {
        }

        /** Formats the median of |value - median| taken over the saved values.
         *
         * The deviations are written to a freshly allocated array, so the saved values
         * are only read, never modified, by this method.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto savedValues = valuesLists.numericValues(fieldIndex);

            auto deviations = new double[savedValues.length];
            foreach (size_t idx, double value; savedValues)
            {
                deviations[idx] = abs(value - center);
            }

            return printOptions.formatNumber(deviations.rangeMedian);
        }
    }
}
3312 
unittest // MadOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoColRows = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeColRows = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    /* The MAD is "nan" with no values and "0" with a single value. */
    testSingleFieldOperator!MadOperator(oneColRows, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(twoColRows, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(twoColRows, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(threeColRows, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(threeColRows, 2, "mad", ["nan", "0", "1", "2"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["16"], [""], ["32"], ["-4"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "0");

    testSingleFieldOperator!MadOperator(
        rowsWithBlanks, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], excludeMissing);
    testSingleFieldOperator!MadOperator(
        rowsWithBlanks, 0, "mad", ["nan", "0", "8", "0", "8", "4"], replaceMissing);
}
3332 
/** VarianceOperator computes the variance of a field's values, using an (n - 1)
 * divisor. This is a numeric operator.
 *
 * The variance is accumulated incrementally from a running count, mean, and sum of
 * squared differences, so field values are not stored. The result is NaN when fewer
 * than two values have been processed.
 */
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    /// Creates a calculator with its own running count, mean, and squared-difference sum.
    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _n = 0.0;            // Number of values seen, kept as a double.
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;   // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /// Folds one more value into the running count, mean, and squared-difference sum.
        final override void processNextField(const char[] nextField)
        {
            _n += 1.0;

            double value = nextField.to!double;
            double diffFromOldMean = value - _runningMean;

            _runningMean += diffFromOldMean / _n;

            /* Uses the difference from the old mean and from the updated mean. */
            _sumSqDiffs += diffFromOldMean * (value - _runningMean);
        }

        /// Formats the squared-difference sum divided by (count - 1), or NaN below two values.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double variance = double.nan;
            if (_n >= 2.0) variance = _sumSqDiffs / (_n - 1.0);
            return printOptions.formatNumber(variance);
        }
    }
}
3379 
unittest // VarianceOperator
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["5"], ["10"], ["15"]];
    auto twoColRows = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeColRows = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    /* The variance is "nan" until two values have been processed. */
    testSingleFieldOperator!VarianceOperator(oneColRows, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoColRows, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoColRows, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeColRows, 2, "var", ["nan", "nan", "0", "3"]);

    /* A table whose last row has an empty field, run under both missing-field policies. */
    auto rowsWithBlank = [["5"], ["10"], [""]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "15");

    testSingleFieldOperator!VarianceOperator(
        rowsWithBlank, 0, "var", ["nan", "nan", "12.5", "12.5"], excludeMissing);
    testSingleFieldOperator!VarianceOperator(
        rowsWithBlank, 0, "var", ["nan", "nan", "12.5", "25"], replaceMissing);
}
3399 
/** StDevOperator computes the standard deviation of a field's values: the square root
 * of the variance taken with an (n - 1) divisor. This is a numeric operator.
 *
 * The statistic is accumulated incrementally from a running count, mean, and sum of
 * squared differences, so field values are not stored. The result is NaN when fewer
 * than two values have been processed.
 */
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    /// Creates a calculator with its own running count, mean, and squared-difference sum.
    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    final class StDevCalculator : SingleFieldCalculator
    {
        private double _n = 0.0;            // Number of values seen, kept as a double.
        private double _runningMean = 0.0;
        private double _sumSqDiffs = 0.0;   // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /// Folds one more value into the running count, mean, and squared-difference sum.
        final override void processNextField(const char[] nextField)
        {
            _n += 1.0;

            double value = nextField.to!double;
            double diffFromOldMean = value - _runningMean;

            _runningMean += diffFromOldMean / _n;

            /* Uses the difference from the old mean and from the updated mean. */
            _sumSqDiffs += diffFromOldMean * (value - _runningMean);
        }

        /// Formats sqrt(squared-difference sum / (count - 1)), or NaN below two values.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            double stdev = double.nan;
            if (_n >= 2.0) stdev = sqrt(_sumSqDiffs / (_n - 1.0));
            return printOptions.formatNumber(stdev);
        }
    }
}
3447 
/* StDevOperator unit tests. Expected values are compared as formatted text, so these
 * tests would be improved by a tolerance option.
 */
unittest
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["1"], ["4"], ["7"]];
    auto twoColRows = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeColRows = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    /* The standard deviation is "nan" until two values have been processed. */
    testSingleFieldOperator!StDevOperator(oneColRows, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(twoColRows, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(twoColRows, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(threeColRows, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* A table whose last row has an empty field, run under both missing-field policies. */
    auto rowsWithBlank = [["1"], ["4"], [""]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "7");

    testSingleFieldOperator!StDevOperator(
        rowsWithBlank, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], excludeMissing);
    testSingleFieldOperator!StDevOperator(
        rowsWithBlank, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], replaceMissing);
}
3469 
/** UniqueCountOperator counts the distinct values of a field. Values are distinct when
 * their text differs exactly; no numeric comparison is made, so "1" and "1.0" count
 * as two values.
 *
 * Every distinct field value is kept in memory by the calculator.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /// Creates a calculator with its own, initially empty, set of seen values.
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Associative array used as a set: only the keys matter. */
        private bool[string] _seen;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /// Records the field text if it has not been seen before.
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _seen) return;

            /* The key is converted to a string only when a new entry is added. */
            _seen[nextField.to!string] = true;
        }

        /// Formats the number of distinct values recorded.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_seen.length);
        }
    }
}
3512 
unittest // UniqueCount
{
    /* Input tables with one, two, and three columns. Each inner array is one row. */
    auto oneColRows = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto twoColRows = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeColRows = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Counts only grow when a row introduces text not seen earlier in that column. */
    testSingleFieldOperator!UniqueCountOperator(
        oneColRows, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(twoColRows, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(twoColRows, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(threeColRows, 2, "unique_count", ["0", "1", "2", "3"]);

    /* Rows with empty fields, run under both missing-field policies. */
    auto rowsWithBlanks = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "XYZ");

    testSingleFieldOperator!UniqueCountOperator(
        rowsWithBlanks, 0, "unique_count",
        ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        excludeMissing);

    testSingleFieldOperator!UniqueCountOperator(
        rowsWithBlanks, 0, "unique_count",
        ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        replaceMissing);
}
3534 
/** MissingCountOperator counts the fields that are missing. It overrides the global
 * missingFieldsPolicy.
 *
 * The caller's policy is kept only for classifying fields. The base class is handed a
 * separate MissingFieldPolicy(false, "") instead (presumably so that missing fields
 * still reach the calculator unchanged; confirm against SingleFieldOperator).
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        auto basePolicy = new MissingFieldPolicy(false, "");
        super("missing_count", fieldIndex, basePolicy);
    }

    /// Creates a calculator with its own counter, starting at zero.
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /// Increments the counter when the caller's policy classifies the field as missing.
        final override void processNextField(const char[] nextField)
        {
            auto classifier = this.outer._globalMissingPolicy;
            if (classifier.isMissingField(nextField))
            {
                ++_missingCount;
            }
        }

        /// Formats the number of missing fields counted.
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3578 
unittest // MissingCount
{
    /* Input tables with one, two, and three columns. Note that " " (a single space)
     * is not an empty field and is not counted as missing.
     */
    auto oneColRows = [["a"], ["b"], [""], [" "], [""]];
    auto twoColRows = [["abc", ""], ["", ""], ["def", ""]];
    auto threeColRows = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing-field policy. */
    testSingleFieldOperator!MissingCountOperator(
        oneColRows, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(twoColRows, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(twoColRows, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(threeColRows, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under the exclude policy and then the replace
     * policy, since the operator overrides the policy it is given.
     */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!MissingCountOperator(
            oneColRows, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            twoColRows, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            twoColRows, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeColRows, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeColRows, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeColRows, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}
3609 
/** NotMissingCountOperator reports how many field values were not missing.
 *
 * This operator sidesteps the global missing-field handling. The base class is
 * handed a policy built as MissingFieldPolicy(false, ""), while the policy supplied
 * by the caller is retained separately and consulted only to decide whether each
 * field value counts as missing.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    /* The caller's policy; used solely for its isMissingField test. */
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        /* The policy given to the base class is independent of the caller's policy. */
        auto operatorPolicy = new MissingFieldPolicy(false, "");
        super("not_missing_count", fieldIndex, operatorPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    /** Per-key calculator keeping a running tally of not-missing field values. */
    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Bumps the tally unless the caller-supplied policy flags the field as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;
            if (globalPolicy.isMissingField(nextField)) return;
            ++_notMissingCount;
        }

        /* Formats the tally via the print options; valuesLists is not consulted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
3653 
unittest // NotMissingCount
{
    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Each case is (input rows, field index, expected outputs). The expected lists
     * hold one more entry than there are input rows. The same expectations are used
     * for every missing-field policy exercised below.
     */
    auto cases = [
        tuple(col1File, 0, ["0", "1", "2", "2", "3", "3"]),
        tuple(col2File, 0, ["0", "1", "1", "2"]),
        tuple(col2File, 1, ["0", "0", "0", "0"]),
        tuple(col3File, 0, ["0", "0", "1", "2"]),
        tuple(col3File, 1, ["0", "1", "1", "1"]),
        tuple(col3File, 2, ["0", "1", "2", "2"]),
    ];

    /* First pass: rely on the helper's default missing-field policy. */
    foreach (testCase; cases)
    {
        testSingleFieldOperator!NotMissingCountOperator(
            testCase[0], testCase[1], "not_missing_count", testCase[2]);
    }

    /* Then repeat with an explicit exclude-missing policy followed by a replace-missing policy. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    foreach (policy; [excludeMissing, replaceMissing])
    {
        foreach (testCase; cases)
        {
            testSingleFieldOperator!NotMissingCountOperator(
                testCase[0], testCase[1], "not_missing_count", testCase[2], policy);
        }
    }
}
3684 
/** ModeOperator outputs the value that occurred most often. When several values
 * share the highest count, the one that was encountered first is produced.
 *
 * Each distinct field value is kept in memory, together with its occurrence count,
 * as part of this calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    /** Per-key calculator tracking occurrence counts and first-seen order. */
    final class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;         // Occurrences per distinct value.
        private Appender!(string[]) _uniqueValues;   // Distinct values, in first-seen order.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
            }
            else
            {
                /* Take an immutable copy; it serves as both the hash key and the ordered entry. */
                string firstSeen = to!string(nextField);
                _uniqueValues.put(firstSeen);
                _valueCounts[firstSeen] = 1;
            }
        }

        /* Scans the values in first-seen order. The strict comparison keeps the earliest
         * value on ties. Produces an empty string if no values were processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string winner = "";
            size_t winnerCount = 0;

            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                immutable occurrences = _valueCounts[candidate];
                if (occurrences <= winnerCount) continue;

                winner = candidate;
                winnerCount = occurrences;
            }

            return winner;
        }
    }
}
3756 
unittest // ModeOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Each case is (input rows, field index, expected outputs), run with the
     * helper's default missing-field policy.
     */
    auto defaultPolicyCases = [
        tuple(col1File, 0, ["", "a", "a", "a", "c", "b", "b", "b"]),
        tuple(col2File, 0, ["", "abc", "abc", "def"]),
        tuple(col2File, 1, ["", "pqr", "pqr", "pqr"]),
        tuple(col3File, 0, ["", "1.0", "1.0", "1.0"]),
        tuple(col3File, 1, ["", "1", "1", "a"]),
        tuple(col3File, 2, ["", "a", "a", "a"]),
    ];

    foreach (testCase; defaultPolicyCases)
    {
        testSingleFieldOperator!ModeOperator(testCase[0], testCase[1], "mode", testCase[2]);
    }

    /* Input containing empty fields, used to exercise explicit missing-field policies. */
    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeOperator(
        col1misFile, 0, "mode",
        ["", "", "a", "a", "a", "a", "c", "b", "b"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeOperator(
        col1misFile, 0, "mode",
        ["", "X", "X", "X", "X", "X", "X", "X", "b"],
        replaceMissing);
}
3778 
/** ModeCountOperator outputs how many times the most frequent value occurred.
 *
 * Each distinct field value is kept in memory, together with its occurrence count,
 * as part of this calculation.
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    /** Per-key calculator tracking an occurrence count for every distinct value. */
    final class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;   // Occurrences per distinct value.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
            }
            else
            {
                /* The hash key must be an immutable copy of the field text. */
                string firstSeen = to!string(nextField);
                _valueCounts[firstSeen] = 1;
            }
        }

        /* Formats the largest stored count; zero when no values were processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highest = 0;

            foreach (occurrences; _valueCounts.byValue)
            {
                highest = (occurrences > highest) ? occurrences : highest;
            }

            return printOptions.formatNumber(highest);
        }
    }
}
3833 
unittest // ModeCountOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Each case is (input rows, field index, expected outputs), run with the
     * helper's default missing-field policy.
     */
    auto defaultPolicyCases = [
        tuple(col1File, 0, ["0", "1", "1", "1", "2", "2", "3", "3"]),
        tuple(col2File, 0, ["0", "1", "1", "2"]),
        tuple(col2File, 1, ["0", "1", "2", "2"]),
        tuple(col3File, 0, ["0", "1", "1", "1"]),
        tuple(col3File, 1, ["0", "1", "1", "2"]),
        tuple(col3File, 2, ["0", "1", "1", "1"]),
    ];

    foreach (testCase; defaultPolicyCases)
    {
        testSingleFieldOperator!ModeCountOperator(testCase[0], testCase[1], "mode_count", testCase[2]);
    }

    /* Input containing empty fields, used to exercise explicit missing-field policies. */
    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ModeCountOperator(
        col1misFile, 0, "mode_count",
        ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ModeCountOperator(
        col1misFile, 0, "mode_count",
        ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
        replaceMissing);
}
3855 
/** ValuesOperator outputs every value seen, joined by the values delimiter taken
 * from the print options.
 *
 * The operator does not accumulate anything itself. It asks for the field's text
 * values to be saved (setSaveFieldValuesText) and reads them back from the unique
 * key values lists at calculation time.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    /** Per-key calculator; a thin reader over the saved text values. */
    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the values are captured by the saved field values lists. */
        final override void processNextField(const char[] nextField)
        { }

        /* Joins the saved text values for this field using the configured values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return savedValues.join(printOptions.valuesDelimiter);
        }
    }
}
3897 
unittest // ValuesOperator
{
    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* Each case is (input rows, field index, expected outputs), run with the
     * helper's default missing-field policy.
     */
    auto defaultPolicyCases = [
        tuple(col1File, 0, ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]),
        tuple(col2File, 0, ["", "", "|", "||xyz"]),
        tuple(col2File, 1, ["", "50", "50|51", "50|51|52"]),
        tuple(col3File, 0, ["", "z", "z|y", "z|y|w"]),
        tuple(col3File, 1, ["", "a", "a|ab", "a|ab|ba"]),
        tuple(col3File, 2, ["", "-", "-|--", "-|--|---"]),
    ];

    foreach (testCase; defaultPolicyCases)
    {
        testSingleFieldOperator!ValuesOperator(testCase[0], testCase[1], "values", testCase[2]);
    }

    /* Explicit missing-field policies, both applied to the first file. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!ValuesOperator(
        col1File, 0, "values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!ValuesOperator(
        col1File, 0, "values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
        replaceMissing);
}
3918 
/** UniqueValuesOperator outputs each distinct value once, joined by the values
 * delimiter taken from the print options. Values appear in the order they were
 * first seen.
 *
 * Every distinct field value is kept in memory as part of this calculation.
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    /** Per-key calculator recording distinct values in first-seen order. */
    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        private size_t[string] _valuesHash;          // Membership test only; stored value is always 1.
        private Appender!(string[]) _uniqueValues;   // Distinct values, in first-seen order.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Records the value the first time it is encountered; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _valuesHash) return;

            /* Take an immutable copy; it serves as both the hash key and the ordered entry. */
            string firstSeen = to!string(nextField);
            _uniqueValues.put(firstSeen);
            _valuesHash[firstSeen] = 1;
        }

        /* Joins the distinct values, in first-seen order, using the configured values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto seenInOrder = _uniqueValues.data;
            return seenInOrder.join(printOptions.valuesDelimiter);
        }
    }
}
3970 
unittest // UniqueValuesOperator
{
    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Each case is (input rows, field index, expected outputs), run with the
     * helper's default missing-field policy.
     */
    auto defaultPolicyCases = [
        tuple(col1File, 0, ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]),
        tuple(col2File, 0, ["", "", "", "|xyz"]),
        tuple(col2File, 1, ["", "50", "50", "50|52"]),
        tuple(col3File, 0, ["", "z", "z|y", "z|y|w"]),
        tuple(col3File, 1, ["", "a", "a|ab", "a|ab|ba"]),
        tuple(col3File, 2, ["", "-", "-|--", "-|--"]),
    ];

    foreach (testCase; defaultPolicyCases)
    {
        testSingleFieldOperator!UniqueValuesOperator(testCase[0], testCase[1], "unique_values", testCase[2]);
    }

    /* Explicit missing-field policies, both applied to the first file. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!UniqueValuesOperator(
        col1File, 0, "unique_values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");
    testSingleFieldOperator!UniqueValuesOperator(
        col1File, 0, "unique_values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
        replaceMissing);
}