1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2019, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_utils.tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple;
19 import std.container : DList;
20 
/* Embedded D runtime options: sets the GC option "cleanup:none" when compiling with a
 * compiler front-end version of 2.085 or later.
 * NOTE(review): presumably this skips the garbage collector's cleanup pass at program
 * exit to speed up termination - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];
22 
version(unittest)
{
    // Unit test builds take their entry point from the compiler's -main switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line and, when processing says to continue, runs the
     * summarization on the arguments following the program name.
     *
     * Returns: The exit code chosen by argument processing when it stops execution
     * (help, version, or an argument error), 1 if summarization throws, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* DMD code coverage builds: merge reports rather than overwrite them. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto argsResult = cmdopt.processArgs(cmdArgs);
        immutable bool shouldContinue = argsResult[0];
        if (!shouldContinue) return argsResult[1];

        /* LDC profiling builds: reset via ldc.profile.resetAll before the main work.
         * NOTE(review): presumably so argument processing is left out of the profile.
         */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            tsvSummarize(cmdopt, cmdArgs[1..$]);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
55 
/** Detailed help text. Printed ahead of the generated option list when the
 * '--help-verbose' option is given.
 *
 * The operator and option names shown here must match the names registered with
 * getopt in TsvSummarizeOptions.processArgs (e.g. 'stdev', '--group-by').
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make is generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min time_mean
   ford   122      127
   chevy  124      124
   bmw    118      122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similarly way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25                // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Numeric values are printed to 12 significant digits by default. This can be
changed using the '--p|float-precision' option. If six or less it sets the
number of significant digits after the decimal point. If greater than six it
sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
133 
/** Brief help text. Printed ahead of the generated option list when help is requested. */
auto helpText =
    "Synopsis: tsv-summarize [options] file [file...]\n"
    ~ "\n"
    ~ "tsv-summarize runs aggregation operations on fields in tab-separated value\n"
    ~ "files. Operations can be run against the full input data or grouped by key\n"
    ~ "fields. Use --help-verbose for more extensive help.\n"
    ~ "\n"
    ~ "Options:\n";
143 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 *
 * Operator options (--min, --quantile, --count, ...) are handled by callbacks that
 * create Operator objects and append them to `operators` as the options are parsed.
 */
struct TsvSummarizeOptions {
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // -H, --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Exceptions thrown while parsing or validating are caught here, reported on stderr,
     * and turned into the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key.", &countOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &operatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &operatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value has the form '<field-list>' or '<field>:<header>'. One operator is
     * created per field, and endFieldIndex is raised to cover each field used. A custom
     * header is rejected when more than one field is listed or when the operator does not
     * allow custom headers.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a colon that is not followed by a header. */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        /* fieldNum is the 1-based position in the field list; fieldIndex is the
         * zero-based field index.
         */
        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (!valSplit[2].empty) // Header specified
                {
                    if (fieldNum > 1)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                                   option, optionVal));
                    }
                    else if (!op.allowCustomHeader)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                                   option, optionVal));
                    }

                    op.setCustomHeader(valSplit[2].to!string);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then rethrow. */
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value has the form '<field-list>:<prob>[,<prob>]' or
     * '<field>:<prob>:<header>'. One operator is created for each field/probability
     * pair. A custom header is allowed only with a single field and a single probability.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.utils :  parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
            {
                fieldIndices ~= fieldIndex;
            }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then rethrow. */
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negation so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator using STR as its header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            /* Option names must match those registered with getopt: 'd|delimiter'. */
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}
435 
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * It selects a Summarizer based on the number of key fields, feeds it every line of
 * every input file (standard input when no files are given or the name is "-"), and
 * finally writes the summary to standard output.
 *
 * Throws: Exception if a line has fewer fields than needed, or if the summarizer fails
 * to process a line. The message names the file and line number.
 */
void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    alias StdoutWriter = typeof(stdout.lockingTextWriter());

    /* The number of key fields determines which Summarizer implementation is used. */
    immutable numKeyFields = cmdopt.keyFields.length;
    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!(StdoutWriter)(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!(StdoutWriter)(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!(StdoutWriter)(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Read the input files line by line. The lineFields buffer is reused for each line.
     * The number of fields needed is zero if no operator needs fields. Notably, the
     * count operator. Used by itself, it counts the number input lines (ala 'wc -l').
     */
    immutable numFieldsNeeded = cmdopt.endFieldIndex;
    auto lineFields = new char[][](numFieldsNeeded);
    bool headerFound = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool isStdin = (filename == "-");
        const sourceName = isStdin ? "Standard Input" : filename;   // Used in error messages.
        auto inputStream = isStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            if (numFieldsNeeded > 0)
            {
                /* Copy up to the needed number of leading fields. */
                size_t numFieldsCopied = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter).take(numFieldsNeeded))
                {
                    lineFields[numFieldsCopied] = fieldValue;
                    numFieldsCopied++;
                }

                if (numFieldsCopied == 0)
                {
                    assert(numFieldsNeeded > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[numFieldsCopied] = line;
                    numFieldsCopied++;
                }

                if (numFieldsCopied < numFieldsNeeded)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               sourceName, lineNum));
                }
            }

            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;
            if (isHeaderLine)
            {
                /* Only the first header line seen is passed on; later ones are skipped. */
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
                continue;
            }

            /* Data line. Processing will fail (throw) if a field cannot be converted
             * to the expected type.
             */
            try
            {
                summarizer.processNextLine(lineFields);
            }
            catch (Exception exc)
            {
                immutable headerHint =
                    (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : "";
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, sourceName, lineNum, headerHint));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    immutable bool headerWanted = cmdopt.hasHeader || cmdopt.writeHeader;
    if (headerWanted) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}
546 
/** Builds the default header for a field.
 *
 * Used when the input has no field headers but the output does. The result is
 * "fieldN", where N is the 1-upped field number (the zero-based index plus one).
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t oneUppedFieldNum = fieldIndex + 1;
    return format("field%d", oneUppedFieldNum);
}

unittest
{
    immutable firstHeader = fieldHeaderFromIndex(0);
    assert(firstHeader == "field1");

    immutable eleventhHeader = fieldHeaderFromIndex(10);
    assert(eleventhHeader == "field11");
}
562 
/** Derives the header of a summary column from the header of its input field.
 *
 * The usual form is `<fieldHeader>_<operation>`: field header "length" with operation
 * "max" gives "length_max". The field header typically comes from a header line in the
 * input data or was constructed by fieldHeaderFromIndex().
 *
 * An empty operationName returns fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0) return fieldHeader;
    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    immutable withOperation = summaryHeaderFromFieldHeader("originalfield", "mycalc");
    assert(withOperation == "originalfield_mycalc");

    immutable withoutOperation = summaryHeaderFromFieldHeader("originalfield", "");
    assert(withoutOperation == "originalfield");
}
583 
/** Printing options used by Summarizers and Calculators. The values typically come from
 * command line options; they are kept in their own struct for modularity.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    /** Formats an integral or floating point number by forwarding it, together with
     * floatPrecision, to tsv_utils.common.numerics.formatNumber.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        import tsv_utils.common.numerics : numericFormatter = formatNumber;
        return numericFormatter!T(n, floatPrecision);
    }
}
602 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * The interface is templated on the output range type that summary output is written to.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after initializing the object, with the operators to be
 *    processed.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    /** Called after initializing the object, with the operators to be processed. */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. lineFields holds the fields of the line. */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line to outputStream, using the print options given. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines to outputStream, using the print options given. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
634 
635 /** SummarizerBase performs work shared by all sumarizers, most everything except for
636  * handling of unique keys.
637  *
638  * The base class handles creation, allocates storage for Operators and SharedFieldValues,
639  * and similar. Derived classes deal primarily with unique keys and the associated Calculators
640  * and UniqueKeyValuesLists.
641  */
642 class SummarizerBase(OutputRange) : Summarizer!OutputRange
643 {
644     private char _inputFieldDelimiter;
645     private bool _hasProcessedFirstHeaderLine = false;
646     private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
647     protected MissingFieldPolicy _missingPolicy;
648     protected DList!Operator _operators;
649     protected size_t _numOperators = 0;
650 
651     this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
652     {
653         _inputFieldDelimiter = inputFieldDelimiter;
654         _missingPolicy = missingPolicy;
655     }
656 
657     char inputFieldDelimiter() const @property
658     {
659         return _inputFieldDelimiter;
660     }
661 
662     /** Sets the Operators used by the Summarizer. Called after construction. */
663     void setOperators(InputRange!Operator operators)
664     {
665         foreach (op; operators)
666         {
667             _operators.insertBack(op);
668             _numOperators++;
669             auto numericFieldsToSave = op.numericFieldsToSave();
670             auto textFieldsToSave = op.textFieldsToSave();
671 
672             if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0)
673             {
674                 if (_sharedFieldValues is null)
675                 {
676                     _sharedFieldValues = new SharedFieldValues();
677                 }
678                 numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x));
679                 textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x));
680             }
681         }
682     }
683 
684     /** Called to process the header line of each file. Returns true if it was the
685      *  first header line processed (used when reading multiple files).
686      */
687     bool processHeaderLine(const char[][] lineFields)
688     {
689         if (!_hasProcessedFirstHeaderLine)
690         {
691             _operators.each!(x => x.processHeaderLine(lineFields));
692             _hasProcessedFirstHeaderLine = true;
693             return true;
694         }
695         else
696         {
697             return false;
698         }
699     }
700 
701     protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
702     {
703         return (_sharedFieldValues is null)
704             ? null
705             : _sharedFieldValues.makeUniqueKeyValuesLists;
706     }
707 
    /** Called to process non-header lines. */
    abstract void processNextLine(const char[][] lineFields);
    /** Called to write the summary header line to outputStream. */
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    /** Called to write the summary result lines to outputStream. */
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
711 }
712 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * There is no grouping key, so exactly one Calculator is kept per Operator and a
 * single value-list container (if any operator saves field values) is shared by
 * all lines.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;          // One per operator, in operator order.
    private UniqueKeyValuesLists _valueLists;   // Null if no operator saves field values.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after construction with the operators to be processed.
     *
     *  The base class stores the operators; this override then creates the single
     *  Calculator needed for each one.
     */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Build the calculators from the stored operator list rather than from
         * 'operators'. An InputRange is single-pass by contract and the base class
         * has already traversed it, so iterating it a second time is not reliable.
         * The array is rebuilt from scratch so it always parallels _operators,
         * which is what writeSummaryHeader/writeSummaryBody rely on.
         */
        _calculators = null;
        _calculators.reserve(_numOperators);
        foreach (op; _operators) _calculators ~= op.makeCalculator;

        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the output
     *  field delimiter, followed by a newline.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result line: one calculated value per operator, joined
     *  by the output field delimiter, followed by a newline.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
762 
/** KeySummarizerBase holds the logic common to the single-key and multi-key
 * summarizers.
 *
 * Those two differ mainly in how the key is formed. They are kept as separate
 * classes to simplify (speed-up) handling of single field keys, which are the most
 * common use case.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /* Per-key state: one Calculator per operator plus the key's saved field values. */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;               // Keys in first-seen order.
    private UniqueKeyData[string] _uniqueKeyData;   // Key -> per-key state.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Feeds a line to the calculators and value lists belonging to 'key',
     *  creating the per-key state the first time the key is seen.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData entry;
        if (auto existing = key in _uniqueKeyData) entry = *existing;
        else entry = addUniqueKey(key.to!string);

        foreach (calc; entry.calculators) calc.processNextLine(lineFields);
        if (entry.valuesLists !is null) entry.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Records a key not seen before and returns its freshly created state. */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        /* One calculator per operator, in operator order. */
        auto keyCalculators = new Calculator[_numOperators];
        size_t slot = 0;
        foreach (op; _operators)
        {
            keyCalculators[slot] = op.makeCalculator;
            ++slot;
        }

        auto entry = UniqueKeyData(keyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = entry;
        return entry;
    }

    /** Writes the header line: key header, then the operator headers. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);

        auto operatorHeaders = _operators[].map!(op => op.header);
        put(outputStream, operatorHeaders.join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (key; _uniqueKeys)
        {
            auto entry = _uniqueKeyData[key];

            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);

            auto results = entry.calculators[].map!(calc => calc.calculate(entry.valuesLists, printOptions));
            put(outputStream, results.join(printOptions.fieldDelimiter));
            put(outputStream, '\n');
        }
    }

    /** Header text for the key column(s); supplied by the concrete summarizer. */
    abstract string keyFieldHeader() const @property;
}
838 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key is the raw text of that field; the key header defaults to a name
 * generated from the field index and is replaced by the actual header text when a
 * header line is processed.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;
    /* NOTE(review): not referenced anywhere in this class and shadows the field of
     * the same name in KeySummarizerBase. Left in place in case other code in the
     * module touches it - candidate for removal.
     */
    private DList!string _uniqueKeys;

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    /** Header text for the key column. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line the key column header is
     *  taken from the key field. Returns true if this was the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: lineFields[_keyFieldIndex] is read below, so an index
         * equal to the length is already out of bounds. Matches processNextLine.
         */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the key field's text as the key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
877 
/** MultiKeySummarizer handles unique keys formed from more than one field.
 *
 * The key is the text of the key fields, in the order given, joined by the input
 * field delimiter. The key header is formed the same way.
 */
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;
    private DList!string _uniqueKeys;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;

        /* Default header built from the field indices; replaced if a header line is processed. */
        auto generatedHeaders = _keyFieldIndices.map!(index => fieldHeaderFromIndex(index));
        _keyFieldHeader = generatedHeaders.join(inputFieldDelimiter);
    }

    /** Header text for the key columns. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line the key header is rebuilt
     *  from the key fields of that line. Returns true if this was the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(index => index < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        immutable bool wasFirst = super.processHeaderLine(lineFields);
        if (wasFirst) _keyFieldHeader = joinKeyFields(lineFields);
        return wasFirst;
    }

    /** Processes a non-header line, keyed by its joined key fields. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(index => index < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        processNextLineWithKey(joinKeyFields(lineFields), lineFields);
    }

    /* Joins the key fields of a line, in key order, with the input field delimiter. */
    private string joinKeyFields(const char[][] lineFields)
    {
        auto keyParts = _keyFieldIndices.map!(index => lineFields[index]);
        return keyParts.join(inputFieldDelimiter).to!string;
    }
}
922 
version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and the expected output.
     * The input file and expected output are already split into lines and fields; the
     * helper manages re-assembly. The program name from the command line args
     * (cmdArgs[0]) is included in assertion messages, which is useful for identifying
     * the test that failed.
     *
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of file
     * input/output would enable running unit tests directly on top of tsvSummarize.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        /* Output sink type used to instantiate the Summarizers. */
        alias OutputBuffer = typeof(appender!(char[])());

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Prefixes a formatted message with the helper name and the test's program name. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // Snapshot taken before processArgs runs.
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!OutputBuffer(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!OutputBuffer(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!OutputBuffer(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    /* Pass the exception text as a format argument, not as the format
                     * string: a '%' in the message would otherwise make format() throw
                     * here and hide the real failure.
                     */
                    assert(false, formatAssertMessage("%s", exc.msg));
                }
            }
        }

        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected lines; non-empty expected output ends with a newline. */
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}
1018 
1019 unittest
1020 {
1021     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1022      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1023      * have separate tests, those tests test the no-key summarizer. The Values operator is
1024      * used in these tests. It engages a number of behaviors, and the results have limited
1025      * ambiguity. Using only one operator limits dependence on individual operators.
1026      */
1027 
1028     auto file1 = [["fld1", "fld2", "fld3"],
1029                   ["a", "a",  "3"],
1030                   ["c", "a",  "2b"],
1031                   ["c", "bc", ""],
1032                   ["a", "c",  "2b"],
1033                   ["",  "bc", ""],
1034                   ["c", "bc", "3"]];
1035 
1036     /* Single-key summarizer tests.
1037      */
1038     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"],
1039                    file1,
1040                    [["fld1", "fld1_values"],
1041                     ["a", "a|a"],
1042                     ["c", "c|c|c"],
1043                     ["",  ""]]
1044         );
1045     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"],
1046                    file1,
1047                    [["fld1", "fld2_values"],
1048                     ["a", "a|c"],
1049                     ["c", "a|bc|bc"],
1050                     ["",  "bc"]]
1051         );
1052     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"],
1053                    file1,
1054                    [["fld1", "fld3_values"],
1055                     ["a", "3|2b"],
1056                     ["c", "2b||3"],
1057                     ["",  ""]]
1058         );
1059     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"],
1060                    file1,
1061                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1062                     ["a", "a|a",   "a|c",     "3|2b"],
1063                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1064                     ["",  "",      "bc",      ""]]
1065         );
1066     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"],
1067                    file1,
1068                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1069                     ["a", "a|a",   "a|c",     "3|2b"],
1070                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1071                     ["",  "",      "bc",      ""]]
1072         );
1073     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"],
1074                    file1,
1075                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1076                     ["a", "3|2b",  "a|c",     "a|a"],
1077                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1078                     ["",  "",      "bc",      ""]]
1079         );
1080     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"],
1081                    file1,
1082                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1083                     ["a", "3|2b",  "a|c",     "a|a"],
1084                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1085                     ["",  "",      "bc",      ""]]
1086         );
1087     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"],
1088                    file1,
1089                    [["fld2", "fld1_values"],
1090                     ["a",  "a|c"],
1091                     ["bc", "c||c"],
1092                     ["c",  "a"]]
1093         );
1094     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"],
1095                    file1,
1096                    [["fld2", "fld2_values"],
1097                     ["a",  "a|a"],
1098                     ["bc", "bc|bc|bc"],
1099                     ["c",  "c"]]
1100         );
1101     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"],
1102                    file1,
1103                    [["fld2", "fld3_values"],
1104                     ["a",  "3|2b"],
1105                     ["bc", "||3"],
1106                     ["c",  "2b"]]
1107         );
1108     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"],
1109                    file1,
1110                    [["fld2", "fld1_values", "fld3_values"],
1111                     ["a",  "a|c",  "3|2b"],
1112                     ["bc", "c||c", "||3"],
1113                     ["c",  "a",    "2b"]]
1114         );
1115     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"],
1116                    file1,
1117                    [["fld2", "fld3_values", "fld1_values"],
1118                     ["a",  "3|2b", "a|c"],
1119                     ["bc", "||3",  "c||c"],
1120                     ["c",  "2b",   "a"]]
1121         );
1122     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"],
1123                    file1,
1124                    [["fld3", "fld1_values"],
1125                     ["3",  "a|c"],
1126                     ["2b", "c|a"],
1127                     ["",   "c|"]]
1128         );
1129     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"],
1130                    file1,
1131                    [["fld3", "fld2_values"],
1132                     ["3",  "a|bc"],
1133                     ["2b", "a|c"],
1134                     ["",   "bc|bc"]]
1135         );
1136     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"],
1137                    file1,
1138                    [["fld3", "fld1_values", "fld2_values"],
1139                     ["3",  "a|c", "a|bc"],
1140                     ["2b", "c|a", "a|c"],
1141                     ["",   "c|",  "bc|bc"]]
1142         );
1143 
1144     /* Multi-key summarizer tests.
1145      */
1146     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"],
1147                    file1,
1148                    [["fld1", "fld2", "fld1_values"],
1149                     ["a", "a",  "a"],
1150                     ["c", "a",  "c"],
1151                     ["c", "bc", "c|c"],
1152                     ["a", "c",  "a"],
1153                     ["", "bc",  ""]]
1154         );
1155     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"],
1156                    file1,
1157                    [["fld1", "fld2", "fld2_values"],
1158                     ["a", "a",  "a"],
1159                     ["c", "a",  "a"],
1160                     ["c", "bc", "bc|bc"],
1161                     ["a", "c",  "c"],
1162                     ["", "bc",  "bc"]]
1163         );
1164     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"],
1165                    file1,
1166                    [["fld1", "fld2", "fld3_values"],
1167                     ["a", "a",  "3"],
1168                     ["c", "a",  "2b"],
1169                     ["c", "bc", "|3"],
1170                     ["a", "c",  "2b"],
1171                     ["", "bc",  ""]]
1172         );
1173     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"],
1174                    file1,
1175                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1176                     ["a", "a",  "3", "a"],
1177                     ["c", "a",  "2b", "c"],
1178                     ["c", "bc", "|3", "c|c"],
1179                     ["a", "c",  "2b", "a"],
1180                     ["",  "bc", "",   ""]]
1181         );
1182     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"],
1183                    file1,
1184                    [["fld3", "fld2", "fld1_values"],
1185                     ["3",  "a",  "a"],
1186                     ["2b", "a",  "c"],
1187                     ["",   "bc", "c|"],
1188                     ["2b", "c",  "a"],
1189                     ["3",  "bc", "c"]]
1190         );
1191     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"],
1192                    file1,
1193                    [["fld3", "fld2", "fld1_values"],
1194                     ["3",  "a",  "a"],
1195                     ["2b", "a",  "c"],
1196                     ["",   "bc", "c|"],
1197                     ["2b", "c",  "a"],
1198                     ["3",  "bc", "c"]]
1199         );
1200     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"],
1201                    file1,
1202                    [["fld2", "fld1", "fld3", "fld2_values"],
1203                     ["a",  "a", "3",  "a"],
1204                     ["a",  "c", "2b", "a"],
1205                     ["bc", "c", "",   "bc"],
1206                     ["c",  "a", "2b", "c"],
1207                     ["bc", "",  "",   "bc"],
1208                     ["bc", "c", "3",  "bc"]]
1209         );
1210 
1211     /* Missing policies. */
1212     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"],
1213                    file1,
1214                    [["fld1", "fld1_values"],
1215                     ["a", "a|a"],
1216                     ["c", "c|c|c"],
1217                     ["",  ""]]
1218         );
1219     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"],
1220                    file1,
1221                    [["fld1", "fld2_values"],
1222                     ["a", "a|c"],
1223                     ["c", "a|bc|bc"],
1224                     ["",  "bc"]]
1225         );
1226     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"],
1227                    file1,
1228                    [["fld1", "fld3_values"],
1229                     ["a", "3|2b"],
1230                     ["c", "2b|3"],
1231                     ["",  ""]]
1232         );
1233     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"],
1234                    file1,
1235                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1236                     ["a", "a|a",   "a|c",     "3|2b"],
1237                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1238                     ["",  "",      "bc",      ""]]
1239         );
1240     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"],
1241                    file1,
1242                    [["fld1", "fld1_values"],
1243                     ["a", "a|a"],
1244                     ["c", "c|c|c"],
1245                     ["",  "NA"]]
1246         );
1247     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"],
1248                    file1,
1249                    [["fld1", "fld2_values"],
1250                     ["a", "a|c"],
1251                     ["c", "a|bc|bc"],
1252                     ["",  "bc"]]
1253         );
1254     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"],
1255                    file1,
1256                    [["fld1", "fld3_values"],
1257                     ["a", "3|2b"],
1258                     ["c", "2b|NA|3"],
1259                     ["",  "NA"]]
1260         );
1261     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"],
1262                    file1,
1263                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1264                     ["a", "a|a",   "a|c",     "3|2b"],
1265                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1266                     ["",  "NA",      "bc",      "NA"]]
1267         );
1268     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"],
1269                    file1,
1270                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1271                     ["a", "a",  "3", "a"],
1272                     ["c", "a",  "2b", "c"],
1273                     ["c", "bc", "3", "c|c"],
1274                     ["a", "c",  "2b", "a"],
1275                     ["",  "bc", "",   ""]]
1276         );
1277     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"],
1278                    file1,
1279                    [["fld3", "fld2", "fld1_values"],
1280                     ["3",  "a",  "a"],
1281                     ["2b", "a",  "c"],
1282                     ["",   "bc", "c"],
1283                     ["2b", "c",  "a"],
1284                     ["3",  "bc", "c"]]
1285         );
1286     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"],
1287                    file1,
1288                    [["fld2", "fld1", "fld3", "fld2_values"],
1289                     ["a",  "a", "3",  "a"],
1290                     ["a",  "c", "2b", "a"],
1291                     ["bc", "c", "",   "bc"],
1292                     ["c",  "a", "2b", "c"],
1293                     ["bc", "",  "",   "bc"],
1294                     ["bc", "c", "3",  "bc"]]
1295         );
1296     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"],
1297                    file1,
1298                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1299                     ["a", "a",  "3", "a"],
1300                     ["c", "a",  "2b", "c"],
1301                     ["c", "bc", "NA|3", "c|c"],
1302                     ["a", "c",  "2b", "a"],
1303                     ["",  "bc", "NA",   "NA"]]
1304         );
1305     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"],
1306                    file1,
1307                    [["fld3", "fld2", "fld1_values"],
1308                     ["3",  "a",  "a"],
1309                     ["2b", "a",  "c"],
1310                     ["",   "bc", "c|NA"],
1311                     ["2b", "c",  "a"],
1312                     ["3",  "bc", "c"]]
1313         );
1314     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"],
1315                    file1,
1316                    [["fld2", "fld1", "fld3", "fld2_values"],
1317                     ["a",  "a", "3",  "a"],
1318                     ["a",  "c", "2b", "a"],
1319                     ["bc", "c", "",   "bc"],
1320                     ["c",  "a", "2b", "c"],
1321                     ["bc", "",  "",   "bc"],
1322                     ["bc", "c", "3",  "bc"]]
1323         );
1324 
1325     /* Validate that the no-key summarizer works with testSummarizer helper function.
1326      */
1327     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"],
1328                    file1,
1329                    [["fld1_values", "fld2_values"],
1330                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1331         );
1332 
1333     /* Header variations: no header line; auto-generated header line; custom headers.
1334      */
1335     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"],
1336                    file1[1..$],
1337                    [["a", "a|a"],
1338                     ["c", "c|c|c"],
1339                     ["",  ""]]
1340         );
1341     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"],
1342                    file1[1..$],
1343                    [["a", "a",  "a"],
1344                     ["c", "a",  "a"],
1345                     ["c", "bc", "bc|bc"],
1346                     ["a", "c",  "c"],
1347                     ["", "bc",  "bc"]]
1348         );
1349     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"],
1350                    file1[1..$],
1351                    [["field2", "field1_values"],
1352                     ["a",  "a|c"],
1353                     ["bc", "c||c"],
1354                     ["c",  "a"]]
1355         );
1356     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"],
1357                    file1[1..$],
1358                    [["field3", "field2", "field1_values"],
1359                     ["3",  "a",  "a"],
1360                     ["2b", "a",  "c"],
1361                     ["",   "bc", "c|"],
1362                     ["2b", "c",  "a"],
1363                     ["3",  "bc", "c"]]
1364         );
1365     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"],
1366                    file1,
1367                    [["fld2", "Field3Values"],
1368                     ["a",  "3|2b"],
1369                     ["bc", "||3"],
1370                     ["c",  "2b"]]
1371         );
1372     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"],
1373                    file1,
1374                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1375                     ["a", "a",  "3", "a"],
1376                     ["c", "a",  "2b", "c"],
1377                     ["c", "bc", "|3", "c|c"],
1378                     ["a", "c",  "2b", "a"],
1379                     ["",  "bc", "",   ""]]
1380         );
1381     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"],
1382                    file1[1..$],
1383                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1384                     ["a", "3|2b",  "a|c",     "a|a"],
1385                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1386                     ["",  "",      "bc",      ""]]
1387         );
1388     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1389                    file1[1..$],
1390                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1391                     ["a", "3",  "a",  "3",  "a", "a"],
1392                     ["c", "2b", "a",  "2b", "c", "a"],
1393                     ["c", "",   "bc", "",   "c", "bc"],
1394                     ["a", "2b", "c",  "2b", "a", "c"],
1395                     ["",  "",   "bc", "",   "",  "bc"],
1396                     ["c", "3",  "bc", "3",  "c", "bc"]]
1397         );
1398     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1399                    file1[1..$],
1400                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1401                     ["a", "3",  "a",  "3",  "a", "a"],
1402                     ["c", "2b", "a",  "2b", "c", "a"],
1403                     ["c", "",   "bc", "",   "c", "bc"],
1404                     ["a", "2b", "c",  "2b", "a", "c"],
1405                     ["",  "",   "bc", "",   "",  "bc"],
1406                     ["c", "3",  "bc", "3",  "c", "bc"]]
1407         );
1408 
1409     /* Alternate file widths and lengths.
1410      */
1411 
1412     auto file3x2 = [["fld1", "fld2", "fld3"],
1413                     ["a", "b", "c"],
1414                     ["c", "b", "a"]];
1415 
1416     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"],
1417                    file3x2,
1418                    [["fld1", "fld3_values"],
1419                     ["a", "c"],
1420                     ["c", "a"]]
1421         );
1422     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"],
1423                    file3x2,
1424                    [["fld2", "fld3_values"],
1425                     ["b", "c|a"]]
1426         );
1427     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"],
1428                    file3x2,
1429                    [["fld2", "fld1", "fld3_values"],
1430                     ["b", "a", "c"],
1431                     ["b", "c", "a"]]
1432         );
1433 
1434     auto file3x1 = [["fld1", "fld2", "fld3"],
1435                     ["a", "b", "c"]];
1436 
1437     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"],
1438                    file3x1,
1439                    [["fld1", "fld3_values"],
1440                     ["a", "c"]]
1441         );
1442     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"],
1443                    file3x1[1..$],
1444                    [["a", "c"]]
1445         );
1446     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"],
1447                    file3x1,
1448                    [["fld2", "fld1", "fld3_values"],
1449                     ["b", "a", "c"]]
1450         );
1451     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"],
1452                    file3x1[1..$],
1453                    [["b", "a", "c"]]
1454         );
1455 
1456     auto file3x0 = [["fld1", "fld2", "fld3"]];
1457 
1458     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"],
1459                    file3x0,
1460                    [["fld1", "fld3_values"]]
1461         );
1462     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"],
1463                    file3x0[1..$],
1464                    []
1465         );
1466     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"],
1467                    file3x0[1..$],
1468                    [["field1", "field3_values"]]
1469         );
1470 
1471 
1472     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"],
1473                    file3x0,
1474                    [["fld2", "fld1", "fld3_values"]]
1475         );
1476 
1477     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"],
1478                    file3x0[1..$],
1479                    []
1480         );
1481 
1482     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"],
1483                    file3x0[1..$],
1484                    [["field2", "field1", "field3_values"]]
1485         );
1486 
1487     auto file2x1 = [["fld1", "fld2"],
1488                     ["a", "b"]];
1489 
1490     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"],
1491                    file2x1,
1492                    [["fld1", "fld2_values"],
1493                     ["a", "b"]]
1494         );
1495     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"],
1496                    file2x1,
1497                    [["fld2", "fld1", "fld1_values"],
1498                     ["b", "a", "a"]]
1499         );
1500 
1501     auto file2x0 = [["fld1", "fld2"]];
1502 
1503     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"],
1504                    file2x0,
1505                    [["fld1", "fld2_values"]]
1506         );
1507     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"],
1508                    file2x0,
1509                    [["fld2", "fld1", "fld1_values"]]
1510         );
1511 
1512     auto file1x2 = [["fld1"],
1513                     ["a"],
1514                     [""]];
1515 
1516     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"],
1517                    file1x2,
1518                    [["fld1", "fld1_values"],
1519                     ["a", "a"],
1520                     ["",  ""]]
1521         );
1522 
1523     auto file1x2b = [["fld1"],
1524                      [""],
1525                      [""]];
1526 
1527     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"],
1528                    file1x2b,
1529                    [["fld1", "fld1_values"],
1530                     ["", "|"]]
1531         );
1532 
1533     auto file1x1 = [["fld1"],
1534                     ["x"]];
1535 
1536     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"],
1537                    file1x1,
1538                    [["fld1", "fld1_values"],
1539                     ["x", "x"]]
1540         );
1541 
1542     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"],
1543                    file1x1[1..$],
1544                    [["x", "x"]]
1545         );
1546 
1547     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"],
1548                    file1x1[1..$],
1549                    [["field1", "field1_values"],
1550                     ["x", "x"]]
1551         );
1552 
1553     auto file1x1b = [["fld1"],
1554                     [""]];
1555 
1556     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"],
1557                    file1x1b,
1558                    [["fld1", "fld1_values"],
1559                     ["", ""]]
1560         );
1561 
1562     auto file1x0 = [["fld1"]];
1563 
1564     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"],
1565                    file1x0,
1566                    [["fld1", "fld1_values"]]
1567         );
1568 
1569     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"],
1570                    file1x0[1..$],
1571                    []
1572         );
1573 
1574     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"],
1575                    file1x0[1..$],
1576                    [["field1", "field1_values"]]
1577         );
1578 
1579     /* Alternate delimiters. */
1580     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"],
1581                    file1,
1582                    [["fld1_values", "fld2_values"],
1583                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1584         );
1585     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"],
1586                    file1,
1587                    [["fld1_values", "fld2_values"],
1588                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1589         );
1590     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","],
1591                    file1,
1592                    [["fld1_values", "fld2_values"],
1593                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1594         );
1595     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1596                     "--delimiter", "^", "--values-delimiter", ":"],
1597                    file1[1..$],
1598                    [["field2", "field1_values"],
1599                     ["a",  "a:c"],
1600                     ["bc", "c::c"],
1601                     ["c",  "a"]]
1602         );
1603     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1604                     "--values-delimiter", "\\"],
1605                    file1[1..$],
1606                    [["a", "a",  "a"],
1607                     ["c", "a",  "a"],
1608                     ["c", "bc", "bc\\bc"],
1609                     ["a", "c",  "c"],
1610                     ["", "bc",  "bc"]]
1611         );
1612 }
1613 
1614 /* Summary Operators and Calculators
1615  *
1616  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1617  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1618  * Calculator is used to manage the summary calculation for each unique key in the input.
1619  *
1620  * As an example, consider the command:
1621  *
1622  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1623  *
1624  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1625  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1626  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1627  * calculator needs to track occurrence count and sum. Calculators produce the final
1628  * value when all processing is finished.
1629  *
1630  * Summary field headers
1631  *
 * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden on
 * the command line. These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
1636  *
1637  * Missing field policy
1638  *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
1643  */
1644 
/** An Operator represents a summary calculation specified on the command line.
 *  e.g. '--mean 5'.
 *
 *  An Operator supplies the header for its summary field, lists the input fields whose
 *  values it needs saved, and creates the Calculator objects that carry out the
 *  calculation.
 */
interface Operator
{
    @property string header();          // Header of this operator's summary field
    @property string name();            // Name of the operator
    void processHeaderLine(const char[][] fields);   // Called with the fields of the input header line
    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved
    Calculator makeCalculator();        // Creates a Calculator for this operator
}
1657 
/** Calculators are responsible for the calculation of a single computation. They
 *  process each line and produce the final value when all processing is finished.
 */
interface Calculator
{
    // Called with the split fields of an input line.
    void processNextLine(const char[][] fields);

    // Produces the final value as a string. 'valuesLists' holds the saved field values
    // a calculator may retrieve; 'printOptions' carries output options (the unit test
    // helpers in this file note it includes the values delimiter).
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}
1666 
/** MissingFieldPolicy describes how missing (empty) field values are processed.
 *
 * One of three modes is in effect: missing values are used unchanged (the default),
 * excluded, or replaced by a replacement string. Supplying a non-empty replacement
 * string selects replacement mode, regardless of the exclusion setting.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are passed through as-is.
    private bool _replaceMissing = false;     // Missing values are swapped for the replacement.
    private string _missingReplacement;       // The replacement, meaningful in replacement mode.

    this (in bool excludeMissing = false, in string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets the policy from an exclusion setting and a replacement string. */
    void updatePolicy(in bool excludeMissing, in string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* A field is considered missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    /* True if missing values are processed unchanged. */
    final @property bool useMissing() const
    {
        return _useMissing;
    }

    /* True if missing values are dropped: neither used as-is nor replaced. */
    final @property bool excludeMissing() const
    {
        return !(_useMissing || _replaceMissing);
    }

    /* True if missing values are replaced by missingReplacement. */
    final @property bool replaceMissing() const
    {
        return _replaceMissing;
    }

    /* The replacement string given to the most recent updatePolicy call. */
    final @property string missingReplacement() const
    {
        return _missingReplacement;
    }
}
1712 
1713 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
1714  * while reading data. Operations like median collect all values and operate on them when
1715  * running the final calculation. Value lists are needed for each unique key. A command
1716  * using multiple Operators may save multiple fields. And, different Operators may be run
1717  * against the same field.
1718  *
1719  * The last part motivates these classes. Handling large data sets necessitates minimizing
1720  * in-memory storage, making it desirable to share identical lists between Calculators.
1721  * Otherwise, each Calculator could implement its own storage, which would be simpler.
1722  *
1723  * The setup works as follows:
1724  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a
 *    list of the fields advertised by Operators as needing sharing. This list gets created
 *    during command initialization (SummarizerBase.setOperators).
 *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
 *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
1731  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
1732  *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextField method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
 *    at least partially. Rather than generate sorted copies, the current algorithms
 *    sort the data in place.
1739  *
1740  * One concession to duplicate storage is that text and numeric versions of the same
1741  * field might be stored. The reason is because it's important to convert text to numbers
1742  * as they are read so that useful error messages can be generated. And, storing both
1743  * forms of the same field should be less common.
1744  *
1745  * The current implementation uses the same missing values policy for all fields. If
1746  * multiple policies become supported this will need to change.
1747  *
 * Built-in calculations - UniqueKeyValuesLists have a built-in median operation. This is
 * to avoid repeated calculations of the median by different calculations.
1750  */
1751 
final class SharedFieldValues
{
    // Indices of the fields whose values need to be saved, kept separately for
    // numeric and text values.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Registers a numeric field index during summarizer setup. For example,
     * '--median 7' ends up calling addNumericIndex(6), 6 being the zero-based index.
     * An index already registered is not added a second time.
     */
    final void addNumericIndex (size_t index)
    {
        immutable bool alreadyRegistered = _numericFieldIndices.canFind(index);
        if (!alreadyRegistered) _numericFieldIndices ~= index;
    }

    /* Same as addNumericIndex, but registers a text field index. */
    final void addTextIndex (size_t index)
    {
        immutable bool alreadyRegistered = _textFieldIndices.canFind(index);
        if (!alreadyRegistered) _textFieldIndices ~= index;
    }

    /* Creates the value lists for one key. Called each time a new key is found, or once
     * at the beginning of the program if no keys are being used (entire column
     * summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto valuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return valuesLists;
    }
}
1781 
/** UniqueKeyValuesLists holds the field values saved for a single unique key.
 *
 * One FieldValues list is kept per saved field, separately for numeric and text
 * values. Lists are filled by processNextLine and read back via the numericValues*,
 * textValues*, and numericValuesMedian methods. The arrays returned are the stored
 * arrays themselves, not copies; sorting and median calculations reorder them in place.
 */
final class UniqueKeyValuesLists
{
    /* A FieldValues object holds a list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;    // Not referenced within this class.

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved.
     * One FieldValues object is created per index.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes an input line to every saved-field list, numeric lists first. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    /* Linear search for the numeric list of a field index. The index must be one of
     * those given to the constructor; this is checked by assert only.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Text counterpart of findNumericFieldValues. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Numeric values saved for a field, in no guaranteed order. */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    /* Numeric values saved for a field, sorted. The stored array is sorted in place. */
    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    /* Text values saved for a field, in no guaranteed order. */
    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    /* Text values saved for a field, sorted. The stored array is sorted in place. */
    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    /* Median of the numeric values saved for a field. The result is cached by the
     * underlying list until more values are added.
     */
    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* The list of values saved for one field, plus cached state (sortedness, median)
     * that is invalidated whenever a value is added.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : Appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this list's field from an input line, applying the missing value policy:
         * a missing (empty) field is saved as-is, replaced, or skipped. Values are
         * converted to ValueType as they are read.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                _values.put(field.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
            else if (missingPolicy.replaceMissing)
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
                _haveMedian = false;
                _isSorted = false;
            }
        }

        /* Returns the saved values (the appender's underlying array). */
        final auto values()
        {
            return _values.data;
        }

        /* Returns the saved values, in whatever order they currently are. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Returns the saved values sorted. The stored array is sorted in place, and
         * only when it isn't already known to be sorted.
         */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* Returns the median of the saved values, computing it on first use and
         * caching it until another value is added.
         */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                /* The median calculation is handed the stored array itself. Per the
                 * notes on these classes, median algorithms sort the data in place, at
                 * least partially, so an earlier full sort can no longer be relied on.
                 * Clearing the flag makes the next getSortedArray call re-sort.
                 */
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}
1938 
/** SingleFieldOperator is the base class for operators that work on a single field,
 * the most common kind of Operator. Derived classes supply makeCalculator and the
 * Calculator class it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        /* Start out with the default header, built from the field index. It is
         * replaced if a custom header is set or an input header line is processed.
         */
        string suffix = "";
        if (_useHeaderSuffix) suffix = operatorName;
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Sets a custom header. Only valid for operators that allow custom headers. */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    final @property string name() const
    {
        return _name;
    }

    final @property bool allowCustomHeader() const
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of the operator's field should be saved. They should be called during
     * construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final @property MissingFieldPolicy missingPolicy()
    {
        return _missingPolicy;
    }

    final @property size_t fieldIndex() const
    {
        return _fieldIndex;
    }

    final @property string header() const
    {
        return _header;
    }

    final @property bool useHeaderSuffix() const
    {
        return _useHeaderSuffix;
    }

    /* Derives the summary header from the input header line. A custom header, if one
     * was set, is left untouched.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);
        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}
2042 
/** SingleFieldCalculator is the base class for calculators that use a single field,
 * the common case. Derived classes implement processNextField() rather than
 * processNextLine().
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final @property size_t fieldIndex() const
    {
        return _fieldIndex;
    }

    /* Picks this calculator's field out of the line and hands it to processNextField,
     * subject to the operator's missing value policy: a missing field is passed on
     * unchanged, replaced by the policy's replacement string, or skipped.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] fieldValue = fields[_fieldIndex];

        if (policy.useMissing || !policy.isMissingField(fieldValue))
        {
            processNextField(fieldValue);
            return;
        }

        if (policy.replaceMissing) processNextField(policy.missingReplacement);
    }

    /* The operator this calculator belongs to. */
    abstract SingleFieldOperator getOperator();

    /* Receives the field value selected by processNextLine. */
    abstract void processNextField(const char[] field);
}
2081 
2082 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2083 version(unittest)
2084 {
2085     /** A helper for SingleFieldOperator unit tests.
2086      *
2087      * testSingleFieldOperator takes a set of split file values, a field index, a header
2088      * suffix, and a set of expected values. The expected values array contains the
2089      * initial value (zero entries) and the expected values after each line. (One more
2090      * expected value than input lines.) The zero entry case is what is generated for an
2091      * empty file. An example testing the 'min' operator against a file with 2 columns,
2092      * 3 rows, using field index 1:
2093      *
2094      *    testSingleFieldOperator!MinOperator(
2095      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2096      *        ["5", "50"],
2097      *        ["20", "200"]],
2098      *       1,                            // Field index (zero-based, so "100", "50", "200")
2099      *       "min",                        // The header suffix, normally the operator name.
2100      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2101      *
     * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
     * The operator is then tested against each column, a total of six calls. Headers
     * are automatically checked. Additional entries can be used to extend coverage.
2105      *
     * A non-default MissingFieldPolicy can be provided as an optional last argument.
     * Operator tests should include exclusion and replacement variations. See operator
     * unit tests for details.
2109      *
2110      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2111      * init arguments. Currently this is used only by the quantile operator.
2112      *
2113      * These tests do not check unique key behavior (group-by). Operators don't have info
2114      * about unique keys, and interact with them only indirectly, via Calculators.
2115      */
2116     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2117         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2118          const char[][] expectedValues,
2119          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2120     {
2121         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2122     }
2123 
2124     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2125         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2126          const char[][] expectedValues,
2127          MissingFieldPolicy missingPolicy,
2128          T extraOpInitArgs)
2129     {
2130         import std.format : format;
2131         import std.array : appender;
2132         import std..string : chomp;
2133         import std.traits : EnumMembers;
2134 
2135         auto numFields = (splitFile[0]).length;
2136 
2137         assert(fieldIndex < numFields,
2138                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2139                       headerSuffix));
2140         assert(splitFile.length + 1 == expectedValues.length,
2141                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2142                       headerSuffix));
2143 
2144         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2145         auto printOptions = SummarizerPrintOptions('#', '|');
2146 
2147         /* An input header line. */
2148         string[] inputHeaderLine = new string[numFields];
2149         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2150 
2151         /* The different expected output field headers. */
2152         auto outputFieldHeaderWithNoHeaderLine =
2153             fieldHeaderFromIndex(fieldIndex)
2154             .summaryHeaderFromFieldHeader(headerSuffix);
2155         auto outputFieldHeaderFromHeaderLine =
2156             inputHeaderLine[fieldIndex]
2157             .summaryHeaderFromFieldHeader(headerSuffix);
2158         auto customOutputFieldHeader = "custom";
2159 
2160         enum HeaderUsecase {
2161             HeaderLine_DefaultHeader,
2162             HeaderLine_CustomHeader,
2163             NoHeaderLine_DefaultHeader,
2164             NoHeaderLine_CustomHeader,
2165             NoHeaderLine_NoOutputHeader,
2166         }
2167 
2168         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2169         {
2170             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2171                           op.name, hc, actual, expected);
2172         }
2173 
2174         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2175                                   const char[] actual, const char[] expected)
2176         {
2177             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2178                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2179         }
2180 
2181         /* Run the logic for each header use case. */
2182         foreach (hc; EnumMembers!HeaderUsecase)
2183         {
2184             bool hasInputHeader = (
2185                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2186                 hc == HeaderUsecase.HeaderLine_CustomHeader
2187                 );
2188             bool hasOutputHeader = (
2189                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2190                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2191                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2192                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2193                 );
2194             bool hasCustomHeader = (
2195                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2196                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2197                 );
2198 
2199             if (hasCustomHeader) assert(hasOutputHeader);
2200 
2201             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2202 
2203             if (hasCustomHeader)
2204             {
2205                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2206                 op.setCustomHeader(customOutputFieldHeader);
2207             }
2208 
2209             Operator[] operatorArray;
2210             operatorArray ~= op;
2211 
2212             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2213             summarizer.setOperators(inputRangeObject(operatorArray));
2214 
2215             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2216 
2217             if (hasOutputHeader)
2218             {
2219                 /* Write the header line. Note that this is a one-field header, */
2220                 auto headerLineOutput = appender!(char[])();
2221                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2222 
2223                 /* Test that the header was generated correctly.
2224                  *
2225                  * Note: Because the output is generated by a Summarizer, it will have a
2226                  * trailing newline. Use chomp to trim it.
2227                  */
2228                 final switch (hc)
2229                 {
2230                 case HeaderUsecase.HeaderLine_DefaultHeader:
2231                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2232                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2233                                                outputFieldHeaderFromHeaderLine));
2234                     break;
2235                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2236                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2237                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2238                                                outputFieldHeaderWithNoHeaderLine));
2239                     break;
2240                 case HeaderUsecase.HeaderLine_CustomHeader:
2241                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2242                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2243                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2244                                                customOutputFieldHeader));
2245                     break;
2246                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2247                     break;
2248                }
2249 
2250             }
2251 
2252             /* For each line, process the line, generate the output, and test that the
2253              * value is correct. Start with the empty file case.
2254              */
2255             foreach (i, const char[] expected; expectedValues)
2256             {
2257                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2258                 auto summaryLineOutput = appender!(char[])();
2259                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2260                 assert(summaryLineOutput.data.chomp == expected,
2261                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2262                                           summaryLineOutput.data.chomp, expectedValues[i]));
2263             }
2264         }
2265     }
2266 }
2267 
/** Base class for operators that consume no input fields.
 *
 * The motivating case is the CountOperator, which counts the occurrences of each
 * unique key. Other uses are possible, for example, weighted random number
 * assignment.
 *
 * ZeroFieldOperator and ZeroFieldCalculator exist mainly to make explicit what
 * information such an operator has available. The split fields passed to
 * processHeaderLine and processNextLine do not include every field in the input
 * (only fields required by operators acting on specific fields are included),
 * which is easy to overlook when implementing an operator. Accordingly, this class
 * requests no fields to be saved and ignores the header line.
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /** Creates the operator. The operator name is also used as the default header. */
    this(string operatorName)
    {
        _name = _header = operatorName;
    }

    /** Replaces the output header with a caller supplied one. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /** Custom headers are accepted by default. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /** The operator name passed to the constructor. */
    final string name() const @property
    {
        return _name;
    }

    /** The current output header: the operator name unless a custom header was set. */
    final string header() const @property
    {
        return _header;
    }

    /* Intentionally empty. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* ZeroFieldOperators have no access to fields, so no numeric fields are requested. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* ZeroFieldOperators have no access to fields, so no text fields are requested. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    /** Derived classes return the calculator doing the actual summarization. */
    abstract ZeroFieldCalculator makeCalculator();
}
2330 
/** Base class for calculators that do not use fields as input, the Count operator
 * in particular. It is the companion of the ZeroFieldOperator class.
 *
 * Derived classes do not implement processNextLine(). They implement
 * processNextEntry() instead, plus the single argument form of calculate(). Both
 * are declared abstract here. The field-based entry points are final and simply
 * forward to those two functions.
 */
class ZeroFieldCalculator : Calculator
{
    /* The two functions derived classes must supply. */
    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);

    this() { }

    /* The fields are ignored; each line is reported as one entry. */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        this.processNextEntry();
    }

    /* The values lists are ignored; the result comes from the single argument form. */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return this.calculate(printOptions);
    }
}
2355 
version(unittest)
{
    /* A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the expected values
     * after each line, preceded by the expected value for the empty file. It must
     * therefore hold one more entry than the number of rows in the split file.
     *
     * The operator is run through a NoKeySummarizer once for each header use case
     * (input header line or not, default or custom output header, no output header).
     * Use cases needing a custom header are skipped for operators not allowing one.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* An empty split file has no first row to take the field count from. */
        auto numFields = splitFile.length > 0 ? splitFile[0].length : 0;

        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. defaultHeader: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        /* Message used when the generated header line differs from the expected one. */
        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        /* Message used when a generated summary value differs from the expected one. */
        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case: index zero of the
             * expected values is checked before any line has been processed.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i,
                                          summaryLineOutput.data.chomp, expected));
            }
        }
    }
}
2496 
/* Specific operators.
 *
 * Notes:
 * - 'Calculator' inner classes declared 'static' (e.g. CountCalculator) do not keep a
 *   reference to the context of the outer class, so their instances need to hold all
 *   needed state themselves. Non-static Calculator inner classes (e.g. RetainCalculator)
 *   return their operator from getOperator via 'this.outer', and receive the field
 *   index they are summarizing as a constructor argument.
 */
2504 
/** CountOperator reports how many times each unique key occurred, or how many input
 * lines were read when there is no unique key.
 *
 * Unlike most other operators, CountOperator does not summarize a specific field on
 * the line; it summarizes a property of the unique key itself. That is why it derives
 * from ZeroFieldOperator and not from SingleFieldOperator.
 */
final class CountOperator : ZeroFieldOperator
{
    /** The operator name, and thereby the default header, is "count". */
    this()
    {
        super("count");
    }

    /** Returns a new calculator with its count at zero. */
    final override ZeroFieldCalculator makeCalculator()
    {
        auto calculator = new CountCalculator;
        return calculator;
    }

    /* Static: the calculator needs nothing from the operator, only its own counter. */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count;    // Number of entries seen so far.

        /* Each entry increases the count by one. */
        final override void processNextEntry()
        {
            _count += 1;
        }

        /* The count, formatted by the print options. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}
2539 
unittest // CountOperator
{
    /* The same counts are expected for every file: zero for the empty file, then one
     * more after each of the three lines, whatever the number of columns.
     */
    auto expectedCounts = ["0", "1", "2", "3"];

    auto oneColumnFile = [["10"], ["9.5"], ["11"]];
    auto twoColumnFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumnFile = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testZeroFieldOperator!CountOperator(oneColumnFile, "count", expectedCounts);
    testZeroFieldOperator!CountOperator(twoColumnFile, "count", expectedCounts);
    testZeroFieldOperator!CountOperator(threeColumnFile, "count", expectedCounts);
}
2550 
/** RetainOperator keeps the first occurrence of a field and leaves the header as is.
 *
 * It is meant for fields whose value is expected to be identical for all occurrences
 * of the unique key, where the goal is to pass the value through unchanged. It works
 * like FirstOperator, except that the original header is preserved. Header
 * preservation is set up in the call to the SingleFieldOperator constructor, which is
 * told not to use a header suffix and not to allow a custom header.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
 */
final class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new RetainCalculator(fieldIndex);
        return calculator;
    }

    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _seenFirst;        // True once a value has been stored.
        private string _value = "";     // The stored value; empty until then.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /* Stores the first field passed in; every later field is ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_seenFirst) return;

            _value = to!string(nextField);
            _seenFirst = true;
        }

        /* The stored value as is. Neither argument is used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2603 
unittest // RetainOperator
{
    auto oneColumnFile = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumnFile = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumnFile = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected per column: empty for the empty file, then always the first row's value. */
    auto expectedCol1 = ["", "r1c1", "r1c1", "r1c1"];
    auto expectedCol2 = ["", "r1c2", "r1c2", "r1c2"];
    auto expectedCol3 = ["", "r1c3", "r1c3", "r1c3"];

    /* The header suffix is empty, retain does not change the header. */
    testSingleFieldOperator!RetainOperator(oneColumnFile, 0, "", expectedCol1);
    testSingleFieldOperator!RetainOperator(twoColumnFile, 0, "", expectedCol1);
    testSingleFieldOperator!RetainOperator(twoColumnFile, 1, "", expectedCol2);
    testSingleFieldOperator!RetainOperator(threeColumnFile, 0, "", expectedCol1);
    testSingleFieldOperator!RetainOperator(threeColumnFile, 1, "", expectedCol2);
    testSingleFieldOperator!RetainOperator(threeColumnFile, 2, "", expectedCol3);

    /* A file with a missing (empty) value in its first row. */
    auto missingValueFile = [[""], ["r2c1"], ["r3c1"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "NA");

    testSingleFieldOperator!RetainOperator(missingValueFile, 0, "", ["", "", "r2c1", "r2c1"],
                                           excludeMissing);
    testSingleFieldOperator!RetainOperator(missingValueFile, 0, "", ["", "NA", "NA", "NA"],
                                           replaceMissing);
}
2623 
/** FirstOperator reports the first value seen for the field.
 */
final class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new FirstCalculator(fieldIndex);
        return calculator;
    }

    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _seenFirst;        // True once a value has been stored.
        private string _value = "";     // The stored value; empty until then.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /* Stores the first field passed in; every later field is ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_seenFirst) return;

            _value = to!string(nextField);
            _seenFirst = true;
        }

        /* The stored value as is. Neither argument is used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2668 
unittest // FirstOperator
{
    auto oneColumnFile = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumnFile = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumnFile = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected per column: empty for the empty file, then always the first row's value. */
    auto expectedCol1 = ["", "r1c1", "r1c1", "r1c1"];
    auto expectedCol2 = ["", "r1c2", "r1c2", "r1c2"];
    auto expectedCol3 = ["", "r1c3", "r1c3", "r1c3"];

    testSingleFieldOperator!FirstOperator(oneColumnFile, 0, "first", expectedCol1);
    testSingleFieldOperator!FirstOperator(twoColumnFile, 0, "first", expectedCol1);
    testSingleFieldOperator!FirstOperator(twoColumnFile, 1, "first", expectedCol2);
    testSingleFieldOperator!FirstOperator(threeColumnFile, 0, "first", expectedCol1);
    testSingleFieldOperator!FirstOperator(threeColumnFile, 1, "first", expectedCol2);
    testSingleFieldOperator!FirstOperator(threeColumnFile, 2, "first", expectedCol3);

    /* A file with a missing (empty) value in its first row. */
    auto missingValueFile = [[""], ["r2c1"], ["r3c1"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "NA");

    testSingleFieldOperator!FirstOperator(missingValueFile, 0, "first", ["", "", "r2c1", "r2c1"],
                                          excludeMissing);
    testSingleFieldOperator!FirstOperator(missingValueFile, 0, "first", ["", "NA", "NA", "NA"],
                                          replaceMissing);
}
2688 
/** LastOperator reports the most recent value seen for the field.
 */
final class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new LastCalculator(fieldIndex);
        return calculator;
    }

    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";     // Most recent value; empty until one is seen.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /* Every field passed in replaces the previously stored one. */
        final override void processNextField(const char[] nextField)
        {
            _value = to!string(nextField);
        }

        /* The stored value as is. Neither argument is used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2728 
unittest // LastOperator
{
    auto oneColumnFile = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumnFile = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumnFile = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* Expected per column: empty for the empty file, then the value of the row just read. */
    auto expectedCol1 = ["", "r1c1", "r2c1", "r3c1"];
    auto expectedCol2 = ["", "r1c2", "r2c2", "r3c2"];
    auto expectedCol3 = ["", "r1c3", "r2c3", "r3c3"];

    testSingleFieldOperator!LastOperator(oneColumnFile, 0, "last", expectedCol1);
    testSingleFieldOperator!LastOperator(twoColumnFile, 0, "last", expectedCol1);
    testSingleFieldOperator!LastOperator(twoColumnFile, 1, "last", expectedCol2);
    testSingleFieldOperator!LastOperator(threeColumnFile, 0, "last", expectedCol1);
    testSingleFieldOperator!LastOperator(threeColumnFile, 1, "last", expectedCol2);
    testSingleFieldOperator!LastOperator(threeColumnFile, 2, "last", expectedCol3);

    /* A file with a missing (empty) value in its first row. */
    auto missingValueFile = [[""], ["r2c1"], ["r3c1"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "NA");

    testSingleFieldOperator!LastOperator(missingValueFile, 0, "last", ["", "", "r2c1", "r3c1"],
                                         excludeMissing);
    testSingleFieldOperator!LastOperator(missingValueFile, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         replaceMissing);
}
2748 
/** MinOperator reports the minimum value of the field. This is a numeric operator.
 */
final class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new MinCalculator(fieldIndex);
        return calculator;
    }

    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;           // No value processed yet.
        private double _value = double.nan;     // Smallest value so far; nan until one is seen.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double. The first value is always taken, later
         * values only when smaller than the current minimum.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            if (_isFirst || candidate < _value)
            {
                _value = candidate;
                _isFirst = false;
            }
        }

        /* The minimum formatted by the print options. The values lists are not used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}
2798 
unittest // MinOperator
{
    auto oneColumnFile = [["10"], ["9.5"], ["11"]];
    auto twoColumnFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumnFile = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value of each test is for the empty file, "nan". */
    testSingleFieldOperator!MinOperator(oneColumnFile, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(twoColumnFile, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoColumnFile, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(threeColumnFile, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeColumnFile, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeColumnFile, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    /* A file with a missing (empty) value in its first row. */
    auto missingValueFile = [[""], ["10"], ["-10"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "5");

    testSingleFieldOperator!MinOperator(missingValueFile, 0, "min", ["nan", "nan", "10", "-10"],
                                        excludeMissing);
    testSingleFieldOperator!MinOperator(missingValueFile, 0, "min", ["nan", "5", "5", "-10"],
                                        replaceMissing);
}
2818 
/** MaxOperator reports the maximum value of the field. This is a numeric operator.
 */
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new MaxCalculator(fieldIndex);
        return calculator;
    }

    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;           // No value processed yet.
        private double _value = double.nan;     // Largest value so far; nan until one is seen.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double. The first value is always taken, later
         * values only when larger than the current maximum.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            if (_isFirst || candidate > _value)
            {
                _value = candidate;
                _isFirst = false;
            }
        }

        /* The maximum formatted by the print options. The values lists are not used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}
2868 
unittest // MaxOperator
{
    auto oneColumnFile = [["10"], ["9.5"], ["11"]];
    auto twoColumnFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumnFile = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The first expected value of each test is for the empty file, "nan". */
    testSingleFieldOperator!MaxOperator(oneColumnFile, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(twoColumnFile, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoColumnFile, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(threeColumnFile, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeColumnFile, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeColumnFile, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    /* A file with a missing (empty) value in its first row. */
    auto missingValueFile = [[""], ["-10"], ["10"]];
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "5");

    testSingleFieldOperator!MaxOperator(missingValueFile, 0, "max", ["nan", "nan", "-10", "10"],
                                        excludeMissing);
    testSingleFieldOperator!MaxOperator(missingValueFile, 0, "max", ["nan", "5", "5", "10"],
                                        replaceMissing);
}
2888 
/** RangeOperator reports the difference between the maximum and the minimum value.
 *
 * The range is zero when there is a single value or when all values are the same.
 * It is also zero when no value was seen, as both bounds start out at zero. This is
 * a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new RangeCalculator(fieldIndex);
        return calculator;
    }

    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;       // No value processed yet.
        private double _minValue = 0.0;     // Smallest value so far.
        private double _maxValue = 0.0;     // Largest value so far.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /* The operator this calculator was created by. */
        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double. The first value sets both bounds, a later
         * value moves the bound it lies beyond, if any.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            if (_isFirst)
            {
                _minValue = candidate;
                _maxValue = candidate;
                _isFirst = false;
                return;
            }

            if (candidate > _maxValue) _maxValue = candidate;
            else if (candidate < _minValue) _minValue = candidate;
        }

        /* Maximum minus minimum, formatted by the print options. The values lists are not used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
2946 
unittest // RangeOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs.
     * The expected list has one more entry than the input has rows; presumably the
     * helper checks the result before any row and again after each row.
     */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["10"], ["9.5"], ["11"]];
    string[][] twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    string[][] threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["0", "0", "0.5", "1.5"]),
        Case(twoColumns, 0, ["0", "0", "1", "2"]),
        Case(twoColumns, 1, ["0", "0", "1", "2"]),
        Case(threeColumns, 0, ["0", "0", "8810", "8810"]),
        Case(threeColumns, 1, ["0", "0", "9", "9"]),
        Case(threeColumns, 2, ["0", "0", "4", "16.5"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!RangeOperator(c.rows, c.field, "range", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!RangeOperator(
        withMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "5.5");  // Replace missing
    testSingleFieldOperator!RangeOperator(
        withMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], replacePolicy);
}
2966 
/** SumOperator adds up every value of a field. This is a numeric operator.
 */
final class SumOperator : SingleFieldOperator
{
    /** Creates a sum operator for `fieldIndex`, using "sum" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Accumulates a running total of the values it is given. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /** Parses `nextField` as a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            double addend = nextField.to!double;
            _runningSum += addend;
        }

        /** Returns the formatted total; zero when no values were processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_runningSum);
        }
    }
}
3006 
unittest // SumOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["10"], ["9.5"], ["11"]];
    string[][] twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    string[][] threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["0", "10", "19.5", "30.5"]),
        Case(twoColumns, 0, ["0", "20", "41", "63"]),
        Case(twoColumns, 1, ["0", "-30", "-59", "-90"]),
        Case(threeColumns, 0, ["0", "9009", "9208", "12211"]),
        Case(threeColumns, 1, ["0", "9", "9", "9.2"]),
        Case(threeColumns, 2, ["0", "-4.5", "-5", "7"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!SumOperator(c.rows, c.field, "sum", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!SumOperator(
        withMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "1.5");  // Replace missing
    testSingleFieldOperator!SumOperator(
        withMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], replacePolicy);
}
3026 
/** MeanOperator computes the arithmetic mean (average) of a field's values. This is
 * a numeric operator.
 */
final class MeanOperator : SingleFieldOperator
{
    /** Creates a mean operator for `fieldIndex`, using "mean" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Tracks the sum and the number of values so the mean can be derived on demand. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _sum = 0.0;
        private size_t _numValues = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /** Parses `nextField` as a double, adds it to the sum, and counts it. */
        final override void processNextField(const char[] nextField)
        {
            _sum += nextField.to!double;
            ++_numValues;
        }

        /** Returns the formatted mean, or formatted NaN when no values were processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* The mean of zero values is undefined; report NaN instead of dividing by zero. */
            if (_numValues == 0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_sum / _numValues.to!double);
        }
    }
}
3069 
unittest // MeanOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["10"], ["9.5"], ["7.5"]];
    string[][] twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    string[][] threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "10", "9.75", "9"]),
        Case(twoColumns, 0, ["nan", "20", "20.5", "21"]),
        Case(twoColumns, 1, ["nan", "-30", "-29.5", "-30"]),
        Case(threeColumns, 0, ["nan", "9009", "4509", "4509"]),
        Case(threeColumns, 1, ["nan", "9", "4.5", "2"]),
        Case(threeColumns, 2, ["nan", "-4.5", "-3", "2"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!MeanOperator(c.rows, c.field, "mean", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["6"], [""], ["14"], ["40"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!MeanOperator(
        withMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MeanOperator(
        withMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"], replacePolicy);
}
3089 
/** MedianOperator reports the median of a field's values. This is a numeric operator.
 *
 * The calculation needs every field value to be held in memory. The operator
 * requests this in its constructor, and the values are then read back from the
 * unique key value lists.
 */
final class MedianOperator : SingleFieldOperator
{
    /** Creates a median operator for `fieldIndex` and requests that the field's
     * numeric values be saved.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Stateless calculator; the median is read from the saved values at the end. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the saved field values carry all the state needed. */
        final override void processNextField(const char[] nextField)
        { }

        /** Returns the formatted median of the values saved for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }
}
3130 
unittest // MedianOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["10"], ["9.5"], ["7.5"]];
    string[][] twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    string[][] threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "10", "9.75", "9.5"]),
        Case(twoColumns, 0, ["nan", "20", "20.5", "21"]),
        Case(twoColumns, 1, ["nan", "-30", "-29.5", "-30"]),
        Case(threeColumns, 0, ["nan", "9009", "4509", "4509"]),
        Case(threeColumns, 1, ["nan", "9", "4.5", "0"]),
        Case(threeColumns, 2, ["nan", "-4.5", "-3", "-1.5"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!MedianOperator(c.rows, c.field, "median", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!MedianOperator(
        withMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MedianOperator(
        withMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], replacePolicy);
}
3150 
/** QuantileOperator reports the value found at a given cumulative probability of a
 * field's data. This is a numeric operation.
 *
 * For example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (equivalently the 25th, 50th, and 75th percentile ranks; the 50th percentile is the
 * median). Data is sorted in ascending order. Each operator instance handles a single
 * probability, but it is common to generate several quantile ranks for the same field
 * when summarizing.
 *
 * The calculation needs every field value to be held in memory. The operator
 * requests this in its constructor, and the values are then read back from the
 * unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /** Creates a quantile operator for `fieldIndex` at the given `probability`.
     *
     * `probability` must lie in [0.0, 1.0]; this is checked with an assert. The
     * operator name is "pct" followed by the probability expressed as a percentage.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(probability >= 0.0 && probability <= 1.0);
        import std.format : format;

        /* Zero is given the literal name "pct0" rather than going through the
         * format string.
         */
        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Stateless calculator; the quantile is computed from the saved values at the end. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the saved field values carry all the state needed. */
        final override void processNextField(const char[] nextField)
        { }

        /** Returns the formatted quantile of the sorted values saved for this field,
         * at the probability the enclosing operator was constructed with.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            auto sortedValues = valuesLists.numericValuesSorted(fieldIndex);
            return printOptions.formatNumber(quantile(this.outer._prob, sortedValues));
        }
    }
}
3206 
unittest // QuantileOperator
{
    /* One table row per check: input rows, field to summarize, expected header,
     * expected outputs, and the probability handed to the operator.
     */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string header;
        string[] expected;
        double probability;
    }

    string[][] oneColumn = [["10"], ["9.5"], ["7.5"]];
    string[][] twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    string[][] threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultMissing = new MissingFieldPolicy;

    auto cases = [
        /* The 50th percentile: same expectations as the median tests. */
        Case(oneColumn, 0, "pct50", ["nan", "10", "9.75", "9.5"], 0.50),
        Case(twoColumns, 0, "pct50", ["nan", "20", "20.5", "21"], 0.50),
        Case(twoColumns, 1, "pct50", ["nan", "-30", "-29.5", "-30"], 0.50),
        Case(threeColumns, 0, "pct50", ["nan", "9009", "4509", "4509"], 0.50),
        Case(threeColumns, 1, "pct50", ["nan", "9", "4.5", "0"], 0.50),
        Case(threeColumns, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], 0.50),

        /* The extremes: probability 0 is the minimum ... */
        Case(oneColumn, 0, "pct0", ["nan", "10", "9.5", "7.5"], 0.0),
        Case(twoColumns, 0, "pct0", ["nan", "20", "20", "20"], 0.0),
        Case(twoColumns, 1, "pct0", ["nan", "-30", "-30", "-31"], 0.0),
        Case(threeColumns, 0, "pct0", ["nan", "9009", "9", "9"], 0.0),
        Case(threeColumns, 1, "pct0", ["nan", "9", "0", "-3"], 0.0),
        Case(threeColumns, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], 0.0),

        /* ... and probability 1 is the maximum. */
        Case(oneColumn, 0, "pct100", ["nan", "10", "10", "10"], 1.0),
        Case(twoColumns, 0, "pct100", ["nan", "20", "21", "22"], 1.0),
        Case(twoColumns, 1, "pct100", ["nan", "-30", "-29", "-29"], 1.0),
        Case(threeColumns, 0, "pct100", ["nan", "9009", "9009", "9009"], 1.0),
        Case(threeColumns, 1, "pct100", ["nan", "9", "9", "9"], 1.0),
        Case(threeColumns, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], 1.0),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperatorBase!QuantileOperator(
            c.rows, c.field, c.header, c.expected, defaultMissing, c.probability);
    }

    /* Missing-field policies: re-use the median expectations. */
    string[][] withMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(
        withMissing, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], excludePolicy, 0.5);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperatorBase!QuantileOperator(
        withMissing, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], replacePolicy, 0.5);
}
3245 
/** MadOperator reports the median absolute deviation from the median. This is a
 * numeric operation.
 *
 * The raw MAD value is returned; no normalization is applied.
 *
 * The calculation needs every field value to be held in memory. The operator
 * requests this in its constructor, and the values are then read back from the
 * unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    /** Creates a MAD operator for `fieldIndex` and requests that the field's
     * numeric values be saved.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /** Stateless calculator; the MAD is computed from the saved values at the end. */
    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the saved field values carry all the state needed. */
        final override void processNextField(const char[] nextField)
        { }

        /** Returns the formatted median of each saved value's absolute distance from
         * the median of the saved values.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto values = valuesLists.numericValues(fieldIndex);

            /* Deviations go into a fresh buffer; the saved values are only read here. */
            auto deviations = new double[values.length];
            foreach (size_t pos, double value; values)
            {
                deviations[pos] = abs(value - center);
            }

            auto mad = deviations.rangeMedian;
            return printOptions.formatNumber(mad);
        }
    }
}
3298 
unittest // MadOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["10"], ["15"], ["20"], ["25"], ["30"]];
    string[][] twoColumns = [["2", "50"], ["2", "51"], ["2", "52"]];
    string[][] threeColumns = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "0", "2.5", "5", "5", "5"]),
        Case(twoColumns, 0, ["nan", "0", "0", "0"]),
        Case(twoColumns, 1, ["nan", "0", "0.5", "1"]),
        Case(threeColumns, 0, ["nan", "0", "4", "0"]),
        Case(threeColumns, 1, ["nan", "0", "0", "0"]),
        Case(threeColumns, 2, ["nan", "0", "1", "2"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!MadOperator(c.rows, c.field, "mad", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["16"], [""], ["32"], ["-4"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!MadOperator(
        withMissing, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MadOperator(
        withMissing, 0, "mad", ["nan", "0", "8", "0", "8", "4"], replacePolicy);
}
3318 
/** VarianceOperator computes the variance of a field's values. This is a numeric
 * operator.
 */
final class VarianceOperator : SingleFieldOperator
{
    /** Creates a variance operator for `fieldIndex`, using "var" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /** Updates a running count, mean, and sum of squared differences one value at a
     * time, so the values themselves are never stored.
     */
    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;  // Number of values seen; kept as a double for the arithmetic.
        private double _mean = 0.0;   // Mean of the values seen so far.
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /** Parses `nextField` as a double and folds it into the running statistics. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double value = nextField.to!double;
            double delta = value - _mean;   // Distance from the mean before the update.

            _mean += delta / _count;
            _m2 += delta * (value - _mean); // Second factor uses the updated mean.
        }

        /** Returns the formatted variance (`_m2` divided by count minus one), or
         * formatted NaN when fewer than two values were processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_count < 2.0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_m2 / (_count - 1.0));
        }
    }
}
3365 
unittest // VarianceOperator
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["5"], ["10"], ["15"]];
    string[][] twoColumns = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    string[][] threeColumns = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "nan", "12.5", "25"]),
        Case(twoColumns, 0, ["nan", "nan", "12.5", "25"]),
        Case(twoColumns, 1, ["nan", "nan", "12.5", "25"]),
        Case(threeColumns, 0, ["nan", "nan", "0.5", "1"]),
        Case(threeColumns, 1, ["nan", "nan", "0.5", "1"]),
        Case(threeColumns, 2, ["nan", "nan", "0", "3"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!VarianceOperator(c.rows, c.field, "var", c.expected);
    }

    /* Input ending in an empty (missing) field, under both missing-field policies. */
    string[][] withMissing = [["5"], ["10"], [""]];

    auto excludePolicy = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!VarianceOperator(
        withMissing, 0, "var", ["nan", "nan", "12.5", "12.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "15");  // Replace missing
    testSingleFieldOperator!VarianceOperator(
        withMissing, 0, "var", ["nan", "nan", "12.5", "25"], replacePolicy);
}
3385 
/** StDevOperator computes the standard deviation of a field's values. This is a
 * numeric operator.
 */
final class StDevOperator : SingleFieldOperator
{
    /** Creates a standard deviation operator for `fieldIndex`, using "stdev" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /** Updates a running count, mean, and sum of squared differences one value at a
     * time, so the values themselves are never stored.
     */
    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;  // Number of values seen; kept as a double for the arithmetic.
        private double _mean = 0.0;   // Mean of the values seen so far.
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /** Parses `nextField` as a double and folds it into the running statistics. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double value = nextField.to!double;
            double delta = value - _mean;   // Distance from the mean before the update.

            _mean += delta / _count;
            _m2 += delta * (value - _mean); // Second factor uses the updated mean.
        }

        /** Returns the formatted square root of `_m2` divided by count minus one, or
         * formatted NaN when fewer than two values were processed.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_count < 2.0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(sqrt(_m2 / (_count - 1.0)));
        }
    }
}
3433 
/* StDevOperator unit tests. Expected values are compared as formatted text, so these
 * tests would be improved by a tolerance option.
 */
unittest
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["1"], ["4"], ["7"]];
    string[][] twoColumns = [["3", "3"], ["3", "9"], ["7", "15"]];
    string[][] threeColumns = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "nan", "2.12132034356", "3"]),
        Case(twoColumns, 0, ["nan", "nan", "0", "2.30940107676"]),
        Case(twoColumns, 1, ["nan", "nan", "4.24264068712", "6"]),
        Case(threeColumns, 0, ["nan", "nan", "9.19238815543", "13"]),
        Case(threeColumns, 1, ["nan", "nan", "8.48528137424", "12"]),
        Case(threeColumns, 2, ["nan", "nan", "10.6066017178", "15"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!StDevOperator(c.rows, c.field, "stdev", c.expected);
    }

    /* Input ending in an empty (missing) field, under both missing-field policies. */
    string[][] withMissing = [["1"], ["4"], [""]];

    auto excludePolicy = new MissingFieldPolicy(true, "");    // Exclude missing
    testSingleFieldOperator!StDevOperator(
        withMissing, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "7");  // Replace missing
    testSingleFieldOperator!StDevOperator(
        withMissing, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], replacePolicy);
}
3455 
/** UniqueCountOperator reports how many distinct values a field has. Values are
 * distinguished by exact text match, not by numeric comparison.
 *
 * Every distinct field value is kept in memory as part of this calculation.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    /** Creates a unique-count operator for `fieldIndex`, using "unique_count" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    /** Collects the distinct field values as keys of an associative array. */
    final class UniqueCountCalculator : SingleFieldCalculator
    {
        private bool[string] _values;   // Used as a set; only the keys matter.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /** Records `nextField` if this exact text has not been seen before. */
        final override void processNextField(const char[] nextField)
        {
            /* Look up first so a string copy is only made for values not yet stored. */
            if (nextField in _values) return;

            _values[nextField.to!string] = true;
        }

        /** Returns the formatted number of distinct values recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}
3498 
unittest // UniqueCount
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    string[][] twoColumns = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    string[][] threeColumns = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    auto cases = [
        Case(oneColumn, 0, ["0", "1", "2", "3", "3", "3", "3", "3", "4"]),
        Case(twoColumns, 0, ["0", "1", "2", "2"]),
        Case(twoColumns, 1, ["0", "1", "1", "2"]),
        Case(threeColumns, 0, ["0", "1", "2", "3"]),
        Case(threeColumns, 1, ["0", "1", "2", "2"]),
        Case(threeColumns, 2, ["0", "1", "2", "3"]),
    ];

    foreach (c; cases)
    {
        testSingleFieldOperator!UniqueCountOperator(c.rows, c.field, "unique_count", c.expected);
    }

    /* Input containing empty (missing) fields, under both missing-field policies. */
    string[][] withMissing = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");      // Exclude missing
    testSingleFieldOperator!UniqueCountOperator(
        withMissing, 0, "unique_count",
        ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "XYZ");  // Replace missing
    testSingleFieldOperator!UniqueCountOperator(
        withMissing, 0, "unique_count",
        ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"], replacePolicy);
}
3520 
/** MissingCountOperator reports how many of a field's values are missing. This
 * overrides the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to classify fields as missing;
 * the base class is given a separate `MissingFieldPolicy(false, "")` instead.
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    /** Creates a missing-count operator for `fieldIndex`, using "missing_count" as its name. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        /* NOTE(review): the (false, "") policy presumably lets missing fields reach
         * the calculator untouched so they can be counted; confirm against
         * MissingFieldPolicy.
         */
        auto ownPolicy = new MissingFieldPolicy(false, "");
        super("missing_count", fieldIndex, ownPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    /** Counts the fields that the saved global policy classifies as missing. */
    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numMissing = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the count when `nextField` is a missing field. */
        final override void processNextField(const char[] nextField)
        {
            auto isMissing = this.outer._globalMissingPolicy.isMissingField(nextField);
            if (isMissing) ++_numMissing;
        }

        /** Returns the formatted number of missing fields seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_numMissing);
        }
    }
}
3564 
unittest // MissingCount
{
    /* One table row per check: input rows, field to summarize, expected outputs. */
    static struct Case
    {
        string[][] rows;
        size_t field;
        string[] expected;
    }

    string[][] oneColumn = [["a"], ["b"], [""], [" "], [""]];
    string[][] twoColumns = [["abc", ""], ["", ""], ["def", ""]];
    string[][] threeColumns = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    auto cases = [
        Case(oneColumn, 0, ["0", "0", "0", "1", "1", "2"]),
        Case(twoColumns, 0, ["0", "0", "1", "1"]),
        Case(twoColumns, 1, ["0", "1", "2", "3"]),
        Case(threeColumns, 0, ["0", "1", "1", "1"]),
        Case(threeColumns, 1, ["0", "0", "1", "2"]),
        Case(threeColumns, 2, ["0", "0", "0", "1"]),
    ];

    /* First pass: rely on the helper's default missing-field policy. */
    foreach (c; cases)
    {
        testSingleFieldOperator!MissingCountOperator(c.rows, c.field, "missing_count", c.expected);
    }

    /* The expected counts are identical under an explicit exclude policy and an
     * explicit replace policy.
     */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    foreach (policy; [excludeMissing, replaceMissing])
    {
        foreach (c; cases)
        {
            testSingleFieldOperator!MissingCountOperator(
                c.rows, c.field, "missing_count", c.expected, policy);
        }
    }
}
3595 
/** NotMissingCountOperator reports how many of a field's values are not missing.
 * This overrides the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to classify fields as missing;
 * the base class is given a separate `MissingFieldPolicy(false, "")` instead.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    /** Creates a not-missing-count operator for `fieldIndex`, using
     * "not_missing_count" as its name.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        /* NOTE(review): the (false, "") policy presumably lets missing fields reach
         * the calculator untouched so they can be told apart; confirm against
         * MissingFieldPolicy.
         */
        auto ownPolicy = new MissingFieldPolicy(false, "");
        super("not_missing_count", fieldIndex, ownPolicy);
    }

    /** Returns a new calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    /** Counts the fields that the saved global policy does not classify as missing. */
    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numPresent = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the count unless `nextField` is a missing field. */
        final override void processNextField(const char[] nextField)
        {
            if (this.outer._globalMissingPolicy.isMissingField(nextField)) return;

            ++_numPresent;
        }

        /** Returns the formatted number of non-missing fields seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_numPresent);
        }
    }
}
3639 
unittest // NotMissingCount
{
    alias check = testSingleFieldOperator!NotMissingCountOperator;
    enum header = "not_missing_count";

    string[][] col1File = [["a"], ["b"], [""], [" "], [""]];
    string[][] col2File = [["abc", ""], ["", ""], ["def", ""]];
    string[][] col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Each expected list has one more entry than the file has rows; presumably these are
     * the results after 0, 1, 2, ... rows (confirm against testSingleFieldOperator).
     * Note the single-space field in col1File is counted as not missing.
     */

    /* Helper's default missing-field policy. */
    check(col1File, 0, header, ["0", "1", "2", "2", "3", "3"]);
    check(col2File, 0, header, ["0", "1", "1", "2"]);
    check(col2File, 1, header, ["0", "0", "0", "0"]);
    check(col3File, 0, header, ["0", "0", "1", "2"]);
    check(col3File, 1, header, ["0", "1", "1", "1"]);
    check(col3File, 2, header, ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under both explicit policies. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        check(col1File, 0, header, ["0", "1", "2", "2", "3", "3"], policy);
        check(col2File, 0, header, ["0", "1", "1", "2"], policy);
        check(col2File, 1, header, ["0", "0", "0", "0"], policy);
        check(col3File, 0, header, ["0", "0", "1", "2"], policy);
        check(col3File, 1, header, ["0", "1", "1", "1"], policy);
        check(col3File, 2, header, ["0", "1", "2", "2"], policy);
    }
}
3670 
/** ModeOperator outputs the value that occurs most often. When several values share
 * the highest count, the one that was seen first is output.
 *
 * Every distinct value is kept in memory together with its occurrence count.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    /** Tallies occurrences per distinct value and picks the most frequent one. */
    final class ModeCalculator : SingleFieldCalculator
    {
        /* Occurrence count for each distinct value. */
        private size_t[string] _valueCounts;

        /* Distinct values in first-seen order; this order decides ties. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the tally of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existing = nextField in _valueCounts)
            {
                (*existing)++;
            }
            else
            {
                /* Take an immutable copy before storing the value. */
                string value = nextField.to!string;
                _uniqueValues.put(value);
                _valueCounts[value] = 1;
            }
        }

        /* Scans the distinct values in first-seen order. Only a strictly larger count
         * replaces the current leader, so the earliest value wins a tie. Returns the
         * empty string when no values were recorded.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string leader = "";
            size_t leaderCount = 0;

            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                immutable occurrences = _valueCounts[candidate];

                if (occurrences <= leaderCount) continue;

                leader = candidate;
                leaderCount = occurrences;
            }

            return leader;
        }
    }
}
3742 
unittest // ModeOperator
{
    alias check = testSingleFieldOperator!ModeOperator;

    string[][] col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    string[][] col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    string[][] col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Each expected list has one more entry than the file has rows; presumably these are
     * the results after 0, 1, 2, ... rows (confirm against testSingleFieldOperator).
     */
    check(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    check(col2File, 0, "mode", ["", "abc", "abc", "def"]);
    check(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    check(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    check(col3File, 1, "mode", ["", "1", "1", "a"]);
    check(col3File, 2, "mode", ["", "a", "a", "a"]);

    string[][] col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    /* Exclude missing: the empty fields leave the mode unchanged. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], excludeMissing);

    /* Replace missing: the empty fields are tallied as "X". */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    check(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], replaceMissing);
}
3764 
/** ModeCountOperator outputs how many times the most frequent value occurred.
 *
 * Every distinct value is kept in memory together with its occurrence count.
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    /** Tallies occurrences per distinct value and reports the largest tally. */
    final class ModeCountCalculator : SingleFieldCalculator
    {
        /* Occurrence count for each distinct value. */
        private size_t[string] _valueCounts;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the tally of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existing = nextField in _valueCounts)
            {
                (*existing)++;
                return;
            }

            /* First sighting: store an immutable copy as the key. */
            string value = nextField.to!string;
            _valueCounts[value] = 1;
        }

        /* Returns the largest tally, formatted by the print options; zero when no
         * values were recorded.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t highest = 0;

            foreach (occurrences; _valueCounts.byValue)
            {
                if (occurrences > highest)
                {
                    highest = occurrences;
                }
            }

            return printOptions.formatNumber(highest);
        }
    }
}
3819 
unittest // ModeCountOperator
{
    alias check = testSingleFieldOperator!ModeCountOperator;

    string[][] col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    string[][] col2File = [["abc", ""], ["def", ""], ["def", "xyz"]];
    string[][] col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Each expected list has one more entry than the file has rows; presumably these are
     * the results after 0, 1, 2, ... rows (confirm against testSingleFieldOperator).
     */
    check(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    check(col2File, 0, "mode_count", ["0", "1", "1", "2"]);
    check(col2File, 1, "mode_count", ["0", "1", "2", "2"]);
    check(col3File, 0, "mode_count", ["0", "1", "1", "1"]);
    check(col3File, 1, "mode_count", ["0", "1", "1", "2"]);
    check(col3File, 2, "mode_count", ["0", "1", "1", "1"]);

    string[][] col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    /* Exclude missing: the empty fields add nothing to any tally. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], excludeMissing);

    /* Replace missing: the empty fields are tallied as "X". */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    check(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], replaceMissing);
}
3841 
/** ValuesOperator outputs all values of the field, joined with the values delimiter
 * taken from the print options.
 *
 * The operator itself stores nothing. Its constructor calls `setSaveFieldValuesText()`,
 * and the calculator later reads the field's text values back from the unique key
 * values lists.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);

        /* Request that the text values of this field be saved. */
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    /** Joins the saved text values of the field into one delimited string. */
    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the values come from the saved field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* Reads the field's saved text values and joins them with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return savedValues.join(printOptions.valuesDelimiter);
        }
    }
}
3883 
unittest // ValuesOperator
{
    alias check = testSingleFieldOperator!ValuesOperator;

    string[][] col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    string[][] col2File = [["", "50"], ["", "51"], ["xyz", "52"]];
    string[][] col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* Each expected list has one more entry than the file has rows; presumably these are
     * the results after 0, 1, 2, ... rows (confirm against testSingleFieldOperator).
     * Empty fields are kept as empty entries between delimiters.
     */
    check(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    check(col2File, 0, "values", ["", "", "|", "||xyz"]);
    check(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]);
    check(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]);
    check(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    check(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]);

    /* Exclude missing: empty fields do not appear in the output. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], excludeMissing);

    /* Replace missing: empty fields appear as "X". */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    check(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], replaceMissing);
}
3904 
/** UniqueValuesOperator outputs each distinct value of the field once, joined with
 * the values delimiter taken from the print options. Values appear in the order in
 * which they were first seen.
 *
 * Every distinct value is kept in memory.
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    /** Collects distinct values in first-seen order. */
    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        /* Membership table; only the keys matter, the stored value is always 1. */
        private size_t[string] _valuesHash;

        /* Distinct values in first-seen order. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Records the value the first time it is seen; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _valuesHash) return;

            /* Take an immutable copy before storing the value. */
            string value = nextField.to!string;
            _uniqueValues.put(value);
            _valuesHash[value] = 1;
        }

        /* Joins the distinct values, in first-seen order, with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctValues = _uniqueValues.data;
            return distinctValues.join(printOptions.valuesDelimiter);
        }
    }
}
3956 
unittest // UniqueValuesOperator
{
    alias check = testSingleFieldOperator!UniqueValuesOperator;

    string[][] col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    string[][] col2File = [["", "50"], ["", "50"], ["xyz", "52"]];
    string[][] col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Each expected list has one more entry than the file has rows; presumably these are
     * the results after 0, 1, 2, ... rows (confirm against testSingleFieldOperator).
     * A repeated value, including the empty string, is listed only once.
     */
    check(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    check(col2File, 0, "unique_values", ["", "", "", "|xyz"]);
    check(col2File, 1, "unique_values", ["", "50", "50", "50|52"]);
    check(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]);
    check(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]);
    check(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]);

    /* Exclude missing: empty fields do not appear in the output. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    check(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], excludeMissing);

    /* Replace missing: empty fields appear as a single "X" entry. */
    auto replaceMissing = new MissingFieldPolicy(false, "X");
    check(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], replaceMissing);
}