1 /**
2 Command line tool that reads TSV files and summarizes field values associated with
3 equivalent keys.
4 
5 Copyright (c) 2016-2018, eBay Software Foundation
6 Initially written by Jon Degenhardt
7 
8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
9 */
10 module tsv_summarize;
11 
12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
13 import std.array : join;
14 import std.conv : to;
15 import std.format : format;
16 import std.range;
17 import std.stdio;
18 import std.typecons : tuple;
19 import std.container : DList;
20 
21 version(unittest)
22 {
23     // When running unit tests, use main from -main compiler switch.
24 }
25 else
26 {
27     int main(string[] cmdArgs)
28     {
29         /* When running in DMD code coverage mode, turn on report merging. */
30         version(D_Coverage) version(DigitalMars)
31         {
32             import core.runtime : dmd_coverSetMerge;
33             dmd_coverSetMerge(true);
34         }
35 
36         TsvSummarizeOptions cmdopt;
37         auto r = cmdopt.processArgs(cmdArgs);
38         if (!r[0]) return r[1];
39         version(LDC_Profile)
40         {
41             import ldc.profile : resetAll;
42             resetAll();
43         }
44         try tsvSummarize(cmdopt, cmdArgs[1..$]);
45         catch (Exception exc)
46         {
47             stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
48             return 1;
49         }
50         return 0;
51     }
52 }
53 
/** Extended help text. Passed to defaultGetoptPrinter ahead of the option list when
 * the --help-verbose option is given.
 *
 * The option and operator names shown here must match the names registered in
 * TsvSummarizeOptions.processArgs (e.g. 'stdev', '--group-by').
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make are generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min time_mean
   ford   122      127
   chevy  124      124
   bmw    118      122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for the full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25                // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Numeric values are printed to 12 significant digits by default. This can be
changed using the '--p|float-precision' option. If six or less it sets the
number of significant digits after the decimal point. If greater than six it
sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";
131 
/** Short help text. Passed to defaultGetoptPrinter ahead of the option list when
 * getopt reports that help was requested.
 */
string helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";
141 
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 *
 * Besides the plain option values, the struct accumulates the list of summarization
 * Operators (in command line order) and tracks endFieldIndex, the number of leading
 * fields that must be read from each input line.
 */
struct TsvSummarizeOptions {
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // -H, --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.

    /* NOTE(review): a class instance created in a field initializer is presumably built
     * once and shared by every TsvSummarizeOptions value. Harmless for the single
     * instance created by main; confirm before creating several option structs.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Errors are not propagated: any Exception raised while parsing or validating is
     * reported on stderr and turned into the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import getopt_inorder;
        import tsvutil :  makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose",       "              Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version",          "              Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by",         "<field-list>  Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header",           "              Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header",     "              Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter",        "CHR           Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR           Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision",  "NUM           'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing",  "              Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing",  "STR           Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count",              "              Count occurrences of each unique key.", &countOptionHandler,
                "count-header",       "STR           Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler,
                "retain",             "<field-list>  Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first",              "<field-list>[:STR]  First value seen.", &operatorOptionHandler!FirstOperator,
                "last",               "<field-list>[:STR]  Last value seen.", &operatorOptionHandler!LastOperator,
                "min",                "<field-list>[:STR]  Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max",                "<field-list>[:STR]  Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range",              "<field-list>[:STR]  Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum",                "<field-list>[:STR]  Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean",               "<field-list>[:STR]  Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median",             "<field-list>[:STR]  Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile",           "<field-list>:p[,p...][:STR]  Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad",                "<field-list>[:STR]  Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var",                "<field-list>[:STR]  Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev",              "<field-list>[:STR]  Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode",               "<field-list>[:STR]  Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count",         "<field-list>[:STR]  Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count",       "<field-list>[:STR]  Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count",      "<field-list>[:STR]  Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count",  "<field-list>[:STR]  Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values",             "<field-list>[:STR]  All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values",      "<field-list>[:STR]  All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            /* Help and version requests stop processing with a success exit code. */
            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value is '<field-list>' or '<field>:<header>'. One operator is created
     * per field and appended to 'operators'; endFieldIndex is raised as needed.
     *
     * Throws an Exception if the value is malformed, if a custom header is combined with
     * multiple fields, or if the operator does not allow custom headers.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsvutil :  parseFieldList;

        /* valSplit[0]: field list, valSplit[1]: the ":" if present, valSplit[2]: header. */
        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a trailing ':' with no header after it. */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        /* fieldNum counts fields from 1, it is only used to detect multiple fields. */
        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (!valSplit[2].empty) // Header specified
                {
                    if (fieldNum > 1)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                                   option, optionVal));
                    }
                    else if (!op.allowCustomHeader)
                    {
                        throw new Exception(
                            format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                                   option, optionVal));
                    }

                    op.setCustomHeader(valSplit[2].to!string);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then rethrow the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>...]' or '<field>:<prob>:<header>'.
     * One QuantileOperator is created for every (field, probability) pair.
     *
     * Throws an Exception if the value is malformed, a probability is not a number in
     * [0.0, 1.0], or a header is combined with multiple fields or probabilities.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsvutil :  parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split: field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split: probabilities from the optional header. An empty remainder
         * (no probabilities given) is rejected here.
         */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
            {
                fieldIndices ~= fieldIndex;
            }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then rethrow the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated range test so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator with STR as header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}
433 
/** tsvSummarize does the primary work of the tsv-summarize program: it reads every
 * input file line by line, feeds the lines to a Summarizer, and finally writes the
 * summary to standard output.
 *
 * Params:
 *   cmdopt     = Options as left by TsvSummarizeOptions.processArgs.
 *   inputFiles = Files to read, in order. "-" is read as standard input, which is
 *                also used when the list is empty.
 *
 * Throws: Exception if a line has fewer fields than cmdopt.endFieldIndex, or if the
 * summarizer fails on a line (the message then names the file and line number).
 */
void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles)
{
    import tsvutil : throwIfWindowsNewlineOnUnix;

    alias StdoutWriter = typeof(stdout.lockingTextWriter());
    immutable numKeyFields = cmdopt.keyFields.length;

    /* The number of key fields decides which Summarizer implementation is used. */
    auto summarizer =
        (numKeyFields == 0)
        ? new NoKeySummarizer!StdoutWriter(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (numKeyFields == 1)
        ? new OneKeySummarizer!StdoutWriter(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!StdoutWriter(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Read the input, one file and one line at a time. The same lineFields buffer is
     * refilled for every line.
     */
    immutable numFieldsNeeded = cmdopt.endFieldIndex;
    auto lineFields = new char[][](numFieldsNeeded);
    bool headerFound = false;

    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool isStdin = (filename == "-");
        auto inputStream = isStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            immutable bool isFirstLine = (lineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            /* Copy the leading fields the operators and keys need. Nothing is needed
             * when no operator reads a field, e.g. the count operator on its own,
             * which then simply counts input lines (ala 'wc -l').
             */
            if (numFieldsNeeded > 0)
            {
                size_t numFieldsRead = 0;
                foreach (fieldValue;
                         line.splitter(cmdopt.inputFieldDelimiter).take(numFieldsNeeded))
                {
                    lineFields[numFieldsRead] = fieldValue;
                    numFieldsRead++;
                }

                if (numFieldsRead == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * Seen with '$ tsv-summarize -g 1 --count' on single column data,
                     * where an empty line is an empty field 1. The empty line itself
                     * serves as that field.
                     */
                    lineFields[0] = line;
                    numFieldsRead = 1;
                }

                if (numFieldsRead < numFieldsNeeded)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               isStdin ? "Standard Input" : filename, lineNum));
                }
            }

            /* With --header, line 1 of every file is a header. Only the first one seen
             * is passed to the summarizer, the others are dropped.
             */
            if (isFirstLine && cmdopt.hasHeader)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
                continue;
            }

            /* Regular data line. Processing throws if a field cannot be converted to
             * the expected type; add file and line details to the error.
             */
            try
            {
                summarizer.processNextLine(lineFields);
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n  File: %s Line: %s%s",
                           exc.msg, isStdin ? "Standard Input" : filename, lineNum,
                           isFirstLine ? "\n  Is this a header line? Use --header to skip." : ""));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print the results. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    immutable bool wantHeader = cmdopt.hasHeader || cmdopt.writeHeader;
    if (wantHeader) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}
544 
/** Builds the default header for a field.
 *
 * Used when the input has no header line but the output needs one. The result is
 * "fieldN", where N is the 1-upped field number, i.e. the zero-based fieldIndex
 * plus one.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t oneUppedFieldNum = fieldIndex + 1;
    return format("field%d", oneUppedFieldNum);
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}
560 
/** Derives the header of a summary column from the header of the field it summarizes.
 *
 * The normal form is `<fieldHeader>_<operation>`: field header "length" with
 * operation "max" gives "length_max". The field header typically comes from a header
 * line in the input data or was constructed by fieldHeaderFromIndex().
 *
 * An empty operationName leaves fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0)
    {
        return fieldHeader;
    }

    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}
581 
/** Printing options shared by Summarizers and Calculators.
 *
 * The values typically come from command line options; they are bundled in their own
 * struct for modularity.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    /** Formats an integral or floating point number by forwarding it, together with
     * floatPrecision, to tsv_numerics.formatNumber.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        /* Renamed on import to keep it apart from this member function. */
        import tsv_numerics : numericFormatter = formatNumber;
        return numericFormatter!T(n, floatPrecision);
    }
}
600 
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * Implementations provide five methods:
 *  - setOperators - Receives the operators to be processed.
 *  - processHeaderLine - Receives the header line of a file.
 *  - processNextLine - Receives each non-header line.
 *  - writeSummaryHeader - Writes the header line of the summary.
 *  - writeSummaryBody - Writes the result lines of the summary.
 */
interface Summarizer(OutputRange)
{
    /** Supplies the operators to be processed. Called after the object has been
     *  initialized.
     */
    void setOperators(InputRange!Operator op);

    /** Processes the header line of a file.
     *
     *  Returns: true if this was the first header line processed, false otherwise.
     *  The distinction matters when reading multiple files.
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Processes one non-header line, given as its fields. */
    void processNextLine(const char[][] lineFields);

    /** Writes the header line of the summary to outputStream. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);

    /** Writes the result lines of the summary to outputStream. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}
632 
/** SummarizerBase performs the work shared by all summarizers, most everything except
 * for the handling of unique keys.
 *
 * It stores the Operators, creates the SharedFieldValues object when an operator asks
 * for field values to be saved, and makes sure only the first header line is passed to
 * the operators. Processing of data lines and writing of the summary are left to the
 * derived classes.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Stays null unless an operator saves field values.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /** The input field delimiter given at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction.
     *
     * Every operator is appended to the operator list and counted. The field indices
     * an operator wants saved, numeric or text, are registered with the shared field
     * values object, which is created on first need.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register for operators that do not save field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null)
            {
                _sharedFieldValues = new SharedFieldValues();
            }

            foreach (fieldIndex; numericIndices) _sharedFieldValues.addNumericIndex(fieldIndex);
            foreach (fieldIndex; textIndices) _sharedFieldValues.addTextIndex(fieldIndex);
        }
    }

    /** Processes the header line of a file. Only the first header line received is
     *  forwarded to the operators.
     *
     *  Returns: true if this was the first header line processed, false for any later
     *  one (as happens when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators[]) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Asks the shared field values object for a new UniqueKeyValuesLists. Returns
     *  null if no operator requested field values to be saved.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}
710 
/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * A single Calculator is kept per Operator, and a single set of saved field values
 * (if any operator needs them) is shared by all calculators.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed.
     *
     *  The base class stores the operators; this override then creates exactly one
     *  Calculator for each operator registered by this call.
     */
    override void setOperators(InputRange!Operator operators)
    {
        /* Operators registered by earlier calls already have calculators. */
        immutable numAlreadyRegistered = _numOperators;

        super.setOperators(operators);

        /* Build the calculators from the stored operator list rather than iterating
         * 'operators' a second time. The base class has already iterated that range,
         * and an InputRange is not guaranteed to be iterable twice.
         */
        foreach (op; _operators[].drop(numAlreadyRegistered)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the output
     *  field delimiter, followed by a newline.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result lines. With no key there is a single result line:
     *  each calculator's result joined by the output field delimiter.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}
760 
/** KeySummarizerBase holds the logic common to the single-key and multi-key
 * summarizers.
 *
 * The two differ mainly in how the key is formed. They are kept as separate classes
 * to simplify (speed-up) handling of single field keys, the most common use case.
 *
 * Unique keys are remembered in the order they are first seen; the summary body is
 * written in that order.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /* State kept per unique key: the calculators (one per operator) and the saved
     * field values, which is null when no operator needs values saved.
     */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;
    private UniqueKeyData[string] _uniqueKeyData;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Processes one non-header line for the given key, creating the per-key state
     *  the first time the key is seen.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData entry;
        if (auto existing = key in _uniqueKeyData) entry = *existing;
        else entry = addUniqueKey(key.to!string);

        foreach (calc; entry.calculators) calc.processNextLine(lineFields);

        if (entry.valuesLists is null) return;
        entry.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Registers a key not seen before: records it in the ordered key list, creates
     *  one Calculator per operator, and stores and returns the new per-key state.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto calculators = new Calculator[_numOperators];
        foreach (slot, oper; _operators[].enumerate)
        {
            calculators[slot] = oper.makeCalculator;
        }

        auto entry = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = entry;
        return entry;
    }

    /** Writes the header line: the key field header followed by the operator headers,
     *  separated by the output field delimiter.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);

        auto operatorHeaders = _operators[].map!(oper => oper.header).join(printOptions.fieldDelimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in first-seen order: the key followed
     *  by each calculator's result, separated by the output field delimiter.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach (key; _uniqueKeys[])
        {
            auto entry = _uniqueKeyData[key];

            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);

            auto results =
                entry.calculators
                .map!(calc => calc.calculate(entry.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter);
            put(outputStream, results);
            put(outputStream, '\n');
        }
    }

    /** The header text written for the key column(s). */
    abstract string keyFieldHeader() const @property;
}
836 
/** This Summarizer is for the case where the unique key is based on exactly one field.
 *
 * The key for a line is the value of the key field. Until a header line is processed
 * the key column header is the one produced by fieldHeaderFromIndex.
 */
class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    /** The header text written for the key column. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line, the key column header is
     *  taken from the key field of that line. Returns true if this was the first
     *  header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: the key field is indexed below. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Called to process non-header lines. The key is the value of the key field. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}
875 
/** This Summarizer is for the case where the unique key is based on multiple fields.
 *
 * The key for a line is the values of the key fields, in the order given, joined by
 * the input field delimiter. The key column header is built the same way.
 */
class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    /** The header text written for the key columns. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line, the key header is rebuilt
     *  from the key fields of that line. Returns true if this was the first header
     *  line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        }
        return isFirstHeaderLine;
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        /* The joined key is a freshly allocated buffer. It is passed on as-is;
         * processNextLineWithKey makes an immutable copy only when the key is new,
         * which avoids a second allocation on every line for keys already seen.
         */
        const(char)[] key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter);
        processNextLineWithKey(key, lineFields);
    }
}
920 
version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and the expected output. The
     * input file and expected output are already split into lines and fields; the
     * helper manages re-assembly. The program name from the command line args
     * (cmdArgs[0]) is printed if an error occurs, which is useful for identifying the
     * test that failed.
     *
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of file
     * input/output would enable running unit tests directly on top of tsvSummarize.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        /* Output range type used by the summarizers under test. */
        alias Sink = typeof(appender!(char[])());

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Prefixes a message with the helper name and the test's program name. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testSummarizer] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Snapshot the args as text before they are handed to processArgs. */
        auto savedCmdArgs = cmdArgs.to!string;

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        SummarizerBase!Sink summarizer;
        if (cmdopt.keyFields.length == 0)
        {
            summarizer = new NoKeySummarizer!Sink(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else if (cmdopt.keyFields.length == 1)
        {
            summarizer = new OneKeySummarizer!Sink(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else
        {
            summarizer = new MultiKeySummarizer!Sink(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            /* Only line 1 can be a header line, so it is seen at most once. */
            if (cmdopt.hasHeader && lineNum == 1)
            {
                summarizer.processHeaderLine(lineFields);
                continue;
            }

            try summarizer.processNextLine(lineFields);
            catch (Exception exc)
            {
                assert(false, formatAssertMessage(exc.msg));
            }
        }

        /* Write the summary into an in-memory buffer. */
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected lines and fields into a single string, terminated
         * by a newline unless it is empty.
         */
        string fieldSeparator = cmdopt.inputFieldDelimiter.to!string;
        auto expectedOutput = expected.map!(fields => fields.joiner(fieldSeparator)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}
1016 
1017 unittest
1018 {
1019     /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
1020      * extent, command line option handling (TsvSummarizeOptions). Individual operators
1021      * have separate tests, those tests test the no-key summarizer. The Values operator is
1022      * used in these tests. It engages a number of behaviors, and the results have limited
1023      * ambiguity. Using only one operator limits dependence on individual operators.
1024      */
1025 
1026     auto file1 = [["fld1", "fld2", "fld3"],
1027                   ["a", "a",  "3"],
1028                   ["c", "a",  "2b"],
1029                   ["c", "bc", ""],
1030                   ["a", "c",  "2b"],
1031                   ["",  "bc", ""],
1032                   ["c", "bc", "3"]];
1033 
1034     /* Single-key summarizer tests.
1035      */
1036     testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"],
1037                    file1,
1038                    [["fld1", "fld1_values"],
1039                     ["a", "a|a"],
1040                     ["c", "c|c|c"],
1041                     ["",  ""]]
1042         );
1043     testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"],
1044                    file1,
1045                    [["fld1", "fld2_values"],
1046                     ["a", "a|c"],
1047                     ["c", "a|bc|bc"],
1048                     ["",  "bc"]]
1049         );
1050     testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"],
1051                    file1,
1052                    [["fld1", "fld3_values"],
1053                     ["a", "3|2b"],
1054                     ["c", "2b||3"],
1055                     ["",  ""]]
1056         );
1057     testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"],
1058                    file1,
1059                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1060                     ["a", "a|a",   "a|c",     "3|2b"],
1061                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1062                     ["",  "",      "bc",      ""]]
1063         );
1064     testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"],
1065                    file1,
1066                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1067                     ["a", "a|a",   "a|c",     "3|2b"],
1068                     ["c", "c|c|c", "a|bc|bc", "2b||3"],
1069                     ["",  "",      "bc",      ""]]
1070         );
1071     testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"],
1072                    file1,
1073                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1074                     ["a", "3|2b",  "a|c",     "a|a"],
1075                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1076                     ["",  "",      "bc",      ""]]
1077         );
1078     testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"],
1079                    file1,
1080                    [["fld1", "fld3_values", "fld2_values", "fld1_values"],
1081                     ["a", "3|2b",  "a|c",     "a|a"],
1082                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1083                     ["",  "",      "bc",      ""]]
1084         );
1085     testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"],
1086                    file1,
1087                    [["fld2", "fld1_values"],
1088                     ["a",  "a|c"],
1089                     ["bc", "c||c"],
1090                     ["c",  "a"]]
1091         );
1092     testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"],
1093                    file1,
1094                    [["fld2", "fld2_values"],
1095                     ["a",  "a|a"],
1096                     ["bc", "bc|bc|bc"],
1097                     ["c",  "c"]]
1098         );
1099     testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"],
1100                    file1,
1101                    [["fld2", "fld3_values"],
1102                     ["a",  "3|2b"],
1103                     ["bc", "||3"],
1104                     ["c",  "2b"]]
1105         );
1106     testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"],
1107                    file1,
1108                    [["fld2", "fld1_values", "fld3_values"],
1109                     ["a",  "a|c",  "3|2b"],
1110                     ["bc", "c||c", "||3"],
1111                     ["c",  "a",    "2b"]]
1112         );
1113     testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"],
1114                    file1,
1115                    [["fld2", "fld3_values", "fld1_values"],
1116                     ["a",  "3|2b", "a|c"],
1117                     ["bc", "||3",  "c||c"],
1118                     ["c",  "2b",   "a"]]
1119         );
1120     testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"],
1121                    file1,
1122                    [["fld3", "fld1_values"],
1123                     ["3",  "a|c"],
1124                     ["2b", "c|a"],
1125                     ["",   "c|"]]
1126         );
1127     testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"],
1128                    file1,
1129                    [["fld3", "fld2_values"],
1130                     ["3",  "a|bc"],
1131                     ["2b", "a|c"],
1132                     ["",   "bc|bc"]]
1133         );
1134     testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"],
1135                    file1,
1136                    [["fld3", "fld1_values", "fld2_values"],
1137                     ["3",  "a|c", "a|bc"],
1138                     ["2b", "c|a", "a|c"],
1139                     ["",   "c|",  "bc|bc"]]
1140         );
1141 
1142     /* Multi-key summarizer tests.
1143      */
1144     testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"],
1145                    file1,
1146                    [["fld1", "fld2", "fld1_values"],
1147                     ["a", "a",  "a"],
1148                     ["c", "a",  "c"],
1149                     ["c", "bc", "c|c"],
1150                     ["a", "c",  "a"],
1151                     ["", "bc",  ""]]
1152         );
1153     testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"],
1154                    file1,
1155                    [["fld1", "fld2", "fld2_values"],
1156                     ["a", "a",  "a"],
1157                     ["c", "a",  "a"],
1158                     ["c", "bc", "bc|bc"],
1159                     ["a", "c",  "c"],
1160                     ["", "bc",  "bc"]]
1161         );
1162     testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"],
1163                    file1,
1164                    [["fld1", "fld2", "fld3_values"],
1165                     ["a", "a",  "3"],
1166                     ["c", "a",  "2b"],
1167                     ["c", "bc", "|3"],
1168                     ["a", "c",  "2b"],
1169                     ["", "bc",  ""]]
1170         );
1171     testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"],
1172                    file1,
1173                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1174                     ["a", "a",  "3", "a"],
1175                     ["c", "a",  "2b", "c"],
1176                     ["c", "bc", "|3", "c|c"],
1177                     ["a", "c",  "2b", "a"],
1178                     ["",  "bc", "",   ""]]
1179         );
1180     testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"],
1181                    file1,
1182                    [["fld3", "fld2", "fld1_values"],
1183                     ["3",  "a",  "a"],
1184                     ["2b", "a",  "c"],
1185                     ["",   "bc", "c|"],
1186                     ["2b", "c",  "a"],
1187                     ["3",  "bc", "c"]]
1188         );
1189     testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"],
1190                    file1,
1191                    [["fld3", "fld2", "fld1_values"],
1192                     ["3",  "a",  "a"],
1193                     ["2b", "a",  "c"],
1194                     ["",   "bc", "c|"],
1195                     ["2b", "c",  "a"],
1196                     ["3",  "bc", "c"]]
1197         );
1198     testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"],
1199                    file1,
1200                    [["fld2", "fld1", "fld3", "fld2_values"],
1201                     ["a",  "a", "3",  "a"],
1202                     ["a",  "c", "2b", "a"],
1203                     ["bc", "c", "",   "bc"],
1204                     ["c",  "a", "2b", "c"],
1205                     ["bc", "",  "",   "bc"],
1206                     ["bc", "c", "3",  "bc"]]
1207         );
1208 
1209     /* Missing policies. */
1210     testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"],
1211                    file1,
1212                    [["fld1", "fld1_values"],
1213                     ["a", "a|a"],
1214                     ["c", "c|c|c"],
1215                     ["",  ""]]
1216         );
1217     testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"],
1218                    file1,
1219                    [["fld1", "fld2_values"],
1220                     ["a", "a|c"],
1221                     ["c", "a|bc|bc"],
1222                     ["",  "bc"]]
1223         );
1224     testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"],
1225                    file1,
1226                    [["fld1", "fld3_values"],
1227                     ["a", "3|2b"],
1228                     ["c", "2b|3"],
1229                     ["",  ""]]
1230         );
1231     testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"],
1232                    file1,
1233                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1234                     ["a", "a|a",   "a|c",     "3|2b"],
1235                     ["c", "c|c|c", "a|bc|bc", "2b|3"],
1236                     ["",  "",      "bc",      ""]]
1237         );
1238     testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"],
1239                    file1,
1240                    [["fld1", "fld1_values"],
1241                     ["a", "a|a"],
1242                     ["c", "c|c|c"],
1243                     ["",  "NA"]]
1244         );
1245     testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"],
1246                    file1,
1247                    [["fld1", "fld2_values"],
1248                     ["a", "a|c"],
1249                     ["c", "a|bc|bc"],
1250                     ["",  "bc"]]
1251         );
1252     testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"],
1253                    file1,
1254                    [["fld1", "fld3_values"],
1255                     ["a", "3|2b"],
1256                     ["c", "2b|NA|3"],
1257                     ["",  "NA"]]
1258         );
1259     testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"],
1260                    file1,
1261                    [["fld1", "fld1_values", "fld2_values", "fld3_values"],
1262                     ["a", "a|a",   "a|c",     "3|2b"],
1263                     ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
1264                     ["",  "NA",      "bc",      "NA"]]
1265         );
1266     testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"],
1267                    file1,
1268                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1269                     ["a", "a",  "3", "a"],
1270                     ["c", "a",  "2b", "c"],
1271                     ["c", "bc", "3", "c|c"],
1272                     ["a", "c",  "2b", "a"],
1273                     ["",  "bc", "",   ""]]
1274         );
1275     testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"],
1276                    file1,
1277                    [["fld3", "fld2", "fld1_values"],
1278                     ["3",  "a",  "a"],
1279                     ["2b", "a",  "c"],
1280                     ["",   "bc", "c"],
1281                     ["2b", "c",  "a"],
1282                     ["3",  "bc", "c"]]
1283         );
1284     testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"],
1285                    file1,
1286                    [["fld2", "fld1", "fld3", "fld2_values"],
1287                     ["a",  "a", "3",  "a"],
1288                     ["a",  "c", "2b", "a"],
1289                     ["bc", "c", "",   "bc"],
1290                     ["c",  "a", "2b", "c"],
1291                     ["bc", "",  "",   "bc"],
1292                     ["bc", "c", "3",  "bc"]]
1293         );
1294     testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"],
1295                    file1,
1296                    [["fld1", "fld2", "fld3_values", "fld1_values"],
1297                     ["a", "a",  "3", "a"],
1298                     ["c", "a",  "2b", "c"],
1299                     ["c", "bc", "NA|3", "c|c"],
1300                     ["a", "c",  "2b", "a"],
1301                     ["",  "bc", "NA",   "NA"]]
1302         );
1303     testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"],
1304                    file1,
1305                    [["fld3", "fld2", "fld1_values"],
1306                     ["3",  "a",  "a"],
1307                     ["2b", "a",  "c"],
1308                     ["",   "bc", "c|NA"],
1309                     ["2b", "c",  "a"],
1310                     ["3",  "bc", "c"]]
1311         );
1312     testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"],
1313                    file1,
1314                    [["fld2", "fld1", "fld3", "fld2_values"],
1315                     ["a",  "a", "3",  "a"],
1316                     ["a",  "c", "2b", "a"],
1317                     ["bc", "c", "",   "bc"],
1318                     ["c",  "a", "2b", "c"],
1319                     ["bc", "",  "",   "bc"],
1320                     ["bc", "c", "3",  "bc"]]
1321         );
1322 
1323     /* Validate that the no-key summarizer works with testSummarizer helper function.
1324      */
1325     testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"],
1326                    file1,
1327                    [["fld1_values", "fld2_values"],
1328                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1329         );
1330 
1331     /* Header variations: no header line; auto-generated header line; custom headers.
1332      */
1333     testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"],
1334                    file1[1..$],
1335                    [["a", "a|a"],
1336                     ["c", "c|c|c"],
1337                     ["",  ""]]
1338         );
1339     testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"],
1340                    file1[1..$],
1341                    [["a", "a",  "a"],
1342                     ["c", "a",  "a"],
1343                     ["c", "bc", "bc|bc"],
1344                     ["a", "c",  "c"],
1345                     ["", "bc",  "bc"]]
1346         );
1347     testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"],
1348                    file1[1..$],
1349                    [["field2", "field1_values"],
1350                     ["a",  "a|c"],
1351                     ["bc", "c||c"],
1352                     ["c",  "a"]]
1353         );
1354     testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"],
1355                    file1[1..$],
1356                    [["field3", "field2", "field1_values"],
1357                     ["3",  "a",  "a"],
1358                     ["2b", "a",  "c"],
1359                     ["",   "bc", "c|"],
1360                     ["2b", "c",  "a"],
1361                     ["3",  "bc", "c"]]
1362         );
1363     testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"],
1364                    file1,
1365                    [["fld2", "Field3Values"],
1366                     ["a",  "3|2b"],
1367                     ["bc", "||3"],
1368                     ["c",  "2b"]]
1369         );
1370     testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"],
1371                    file1,
1372                    [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
1373                     ["a", "a",  "3", "a"],
1374                     ["c", "a",  "2b", "c"],
1375                     ["c", "bc", "|3", "c|c"],
1376                     ["a", "c",  "2b", "a"],
1377                     ["",  "bc", "",   ""]]
1378         );
1379     testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"],
1380                    file1[1..$],
1381                    [["field1", "f3_vals", "f2_vals", "f1_vals"],
1382                     ["a", "3|2b",  "a|c",     "a|a"],
1383                     ["c", "2b||3", "a|bc|bc", "c|c|c"],
1384                     ["",  "",      "bc",      ""]]
1385         );
1386     testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1387                    file1[1..$],
1388                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1389                     ["a", "3",  "a",  "3",  "a", "a"],
1390                     ["c", "2b", "a",  "2b", "c", "a"],
1391                     ["c", "",   "bc", "",   "c", "bc"],
1392                     ["a", "2b", "c",  "2b", "a", "c"],
1393                     ["",  "",   "bc", "",   "",  "bc"],
1394                     ["c", "3",  "bc", "3",  "c", "bc"]]
1395         );
1396     testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
1397                    file1[1..$],
1398                    [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
1399                     ["a", "3",  "a",  "3",  "a", "a"],
1400                     ["c", "2b", "a",  "2b", "c", "a"],
1401                     ["c", "",   "bc", "",   "c", "bc"],
1402                     ["a", "2b", "c",  "2b", "a", "c"],
1403                     ["",  "",   "bc", "",   "",  "bc"],
1404                     ["c", "3",  "bc", "3",  "c", "bc"]]
1405         );
1406 
1407     /* Alternate file widths and lengths.
1408      */
1409 
1410     auto file3x2 = [["fld1", "fld2", "fld3"],
1411                     ["a", "b", "c"],
1412                     ["c", "b", "a"]];
1413 
1414     testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"],
1415                    file3x2,
1416                    [["fld1", "fld3_values"],
1417                     ["a", "c"],
1418                     ["c", "a"]]
1419         );
1420     testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"],
1421                    file3x2,
1422                    [["fld2", "fld3_values"],
1423                     ["b", "c|a"]]
1424         );
1425     testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"],
1426                    file3x2,
1427                    [["fld2", "fld1", "fld3_values"],
1428                     ["b", "a", "c"],
1429                     ["b", "c", "a"]]
1430         );
1431 
1432     auto file3x1 = [["fld1", "fld2", "fld3"],
1433                     ["a", "b", "c"]];
1434 
1435     testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"],
1436                    file3x1,
1437                    [["fld1", "fld3_values"],
1438                     ["a", "c"]]
1439         );
1440     testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"],
1441                    file3x1[1..$],
1442                    [["a", "c"]]
1443         );
1444     testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"],
1445                    file3x1,
1446                    [["fld2", "fld1", "fld3_values"],
1447                     ["b", "a", "c"]]
1448         );
1449     testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"],
1450                    file3x1[1..$],
1451                    [["b", "a", "c"]]
1452         );
1453 
1454     auto file3x0 = [["fld1", "fld2", "fld3"]];
1455 
1456     testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"],
1457                    file3x0,
1458                    [["fld1", "fld3_values"]]
1459         );
1460     testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"],
1461                    file3x0[1..$],
1462                    []
1463         );
1464     testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"],
1465                    file3x0[1..$],
1466                    [["field1", "field3_values"]]
1467         );
1468 
1469 
1470     testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"],
1471                    file3x0,
1472                    [["fld2", "fld1", "fld3_values"]]
1473         );
1474 
1475     testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"],
1476                    file3x0[1..$],
1477                    []
1478         );
1479 
1480     testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"],
1481                    file3x0[1..$],
1482                    [["field2", "field1", "field3_values"]]
1483         );
1484 
1485     auto file2x1 = [["fld1", "fld2"],
1486                     ["a", "b"]];
1487 
1488     testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"],
1489                    file2x1,
1490                    [["fld1", "fld2_values"],
1491                     ["a", "b"]]
1492         );
1493     testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"],
1494                    file2x1,
1495                    [["fld2", "fld1", "fld1_values"],
1496                     ["b", "a", "a"]]
1497         );
1498 
1499     auto file2x0 = [["fld1", "fld2"]];
1500 
1501     testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"],
1502                    file2x0,
1503                    [["fld1", "fld2_values"]]
1504         );
1505     testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"],
1506                    file2x0,
1507                    [["fld2", "fld1", "fld1_values"]]
1508         );
1509 
1510     auto file1x2 = [["fld1"],
1511                     ["a"],
1512                     [""]];
1513 
1514     testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"],
1515                    file1x2,
1516                    [["fld1", "fld1_values"],
1517                     ["a", "a"],
1518                     ["",  ""]]
1519         );
1520 
1521     auto file1x2b = [["fld1"],
1522                      [""],
1523                      [""]];
1524 
1525     testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"],
1526                    file1x2b,
1527                    [["fld1", "fld1_values"],
1528                     ["", "|"]]
1529         );
1530 
1531     auto file1x1 = [["fld1"],
1532                     ["x"]];
1533 
1534     testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"],
1535                    file1x1,
1536                    [["fld1", "fld1_values"],
1537                     ["x", "x"]]
1538         );
1539 
1540     testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"],
1541                    file1x1[1..$],
1542                    [["x", "x"]]
1543         );
1544 
1545     testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"],
1546                    file1x1[1..$],
1547                    [["field1", "field1_values"],
1548                     ["x", "x"]]
1549         );
1550 
1551     auto file1x1b = [["fld1"],
1552                     [""]];
1553 
1554     testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"],
1555                    file1x1b,
1556                    [["fld1", "fld1_values"],
1557                     ["", ""]]
1558         );
1559 
1560     auto file1x0 = [["fld1"]];
1561 
1562     testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"],
1563                    file1x0,
1564                    [["fld1", "fld1_values"]]
1565         );
1566 
1567     testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"],
1568                    file1x0[1..$],
1569                    []
1570         );
1571 
1572     testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"],
1573                    file1x0[1..$],
1574                    [["field1", "field1_values"]]
1575         );
1576 
1577     /* Alternate delimiters. */
1578     testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"],
1579                    file1,
1580                    [["fld1_values", "fld2_values"],
1581                     ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
1582         );
1583     testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"],
1584                    file1,
1585                    [["fld1_values", "fld2_values"],
1586                     ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
1587         );
1588     testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","],
1589                    file1,
1590                    [["fld1_values", "fld2_values"],
1591                     ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
1592         );
1593     testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
1594                     "--delimiter", "^", "--values-delimiter", ":"],
1595                    file1[1..$],
1596                    [["field2", "field1_values"],
1597                     ["a",  "a:c"],
1598                     ["bc", "c::c"],
1599                     ["c",  "a"]]
1600         );
1601     testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
1602                     "--values-delimiter", "\\"],
1603                    file1[1..$],
1604                    [["a", "a",  "a"],
1605                     ["c", "a",  "a"],
1606                     ["c", "bc", "bc\\bc"],
1607                     ["a", "c",  "c"],
1608                     ["", "bc",  "bc"]]
1609         );
1610 }
1611 
1612 /* Summary Operators and Calculators
1613  *
1614  * Two types of objects are used in implementation: Operators and Calculators. An Operator
1615  * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
1616  * Calculator is used to manage the summary calculation for each unique key in the input.
1617  *
1618  * As an example, consider the command:
1619  *
1620  *    $tsv-summarize --group-by 1 --mean 3 --mean 5
1621  *
1622  * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
1623  * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
1624  * create MeanCalculator objects for each unique value in field 1. For 'mean', a
1625  * calculator needs to track occurrence count and sum. Calculators produce the final
1626  * value when all processing is finished.
1627  *
1628  * Summary field headers
1629  *
1630  * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden on
 * the command line. These scenarios are supported via the operator constructor and the
1633  * processHeaderLine() method.
1634  *
1635  * Missing field policy
1636  *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy; each operator contains one.
 * Calculators access their operator's policy object.
1641  */
1642 
/** An Operator stands for one summary calculation requested on the command line,
 *  e.g. '--mean 5'. It supplies the header for its output column, lists the input
 *  fields whose values it needs saved, and creates Calculator objects.
 */
interface Operator
{
    /// Name of the operator.
    @property string name();

    /// Header text for the summary column produced by this operator.
    @property string header();

    /// Receives the split fields of the input header line.
    void processHeaderLine(const char[][] fields);

    /// Numeric fields this Operator needs saved.
    size_t[] numericFieldsToSave();

    /// Text fields this Operator needs saved.
    size_t[] textFieldsToSave();

    /// Creates a Calculator for this operator.
    Calculator makeCalculator();
}
1655 
/** A Calculator carries out a single computation. It is handed each input line in
 *  turn and produces the final value once all processing is finished.
 */
interface Calculator
{
    /// Consumes the split fields of the next input line.
    void processNextLine(const char[][] fields);

    /// Produces the final value as a string. The saved value lists and the print
    /// options are supplied by the caller.
    string calculate(
        UniqueKeyValuesLists valuesLists,
        const ref SummarizerPrintOptions printOptions);
}
1664 
/** Describes what happens when a missing (empty) field value is encountered: the
 *  value is either processed unchanged, excluded, or replaced by a substitute string.
 */
class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are passed through unchanged.
    private bool _replaceMissing = false;     // Missing values are swapped for the replacement.
    private string _missingReplacement;       // The substitute text used when replacing.

    /* Builds a policy from the two settings; see updatePolicy for how they combine. */
    this (in bool excludeMissing = false, in string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets the policy. A non-empty replacement string turns on replacement. Missing
     * values are used unchanged only when neither exclusion nor replacement is active.
     */
    void updatePolicy(in bool excludeMissing, in string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* A field counts as missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    /* True if missing values are processed unchanged. */
    final bool useMissing() const @property
    {
        return _useMissing;
    }

    /* True if missing values are neither used unchanged nor replaced. */
    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    /* True if missing values are replaced. */
    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    /* The replacement string given to the most recent updatePolicy call. */
    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}
1710 
1711 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
1712  * while reading data. Operations like median collect all values and operate on them when
1713  * running the final calculation. Value lists are needed for each unique key. A command
1714  * using multiple Operators may save multiple fields. And, different Operators may be run
1715  * against the same field.
1716  *
1717  * The last part motivates these classes. Handling large data sets necessitates minimizing
1718  * in-memory storage, making it desirable to share identical lists between Calculators.
1719  * Otherwise, each Calculator could implement its own storage, which would be simpler.
1720  *
1721  * The setup works as follows:
1722  *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
 *    of the fields advertised by Operators as needing sharing. This list gets created
 *    during command initialization (SummarizerBase.setOperators).
 *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
 *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
1729  *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
1730  *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextField method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
 *    at least partially. Rather than generate sorted copies, the current algorithms
 *    sort the data in place.
1737  *
 * One concession to duplicate storage is that text and numeric versions of the same
 * field might be stored. The reason is that it's important to convert text to numbers
 * as they are read so that useful error messages can be generated. And, storing both
 * forms of the same field should be less common.
1742  *
1743  * The current implementation uses the same missing values policy for all fields. If
1744  * multiple policies become supported this will need to change.
1745  *
 * Built-in calculations - UniqueKeyValuesLists objects have a built-in median operation.
 * This is to avoid repeated calculations of the median by different calculations.
1748  */
1749 
class SharedFieldValues
{
    // Indices of the fields whose values need to be saved, kept separately per value type.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to register a field index whose numeric values must
     * be saved. E.g. '--median 7' ends up calling addNumericIndex(6), 6 being the
     * zero-based index. An index that is already registered is not added a second time.
     */
    final void addNumericIndex (size_t index)
    {
        if (_numericFieldIndices.canFind(index)) return;
        _numericFieldIndices ~= index;
    }

    /* Same as addNumericIndex, but registers the index as a text field. */
    final void addTextIndex (size_t index)
    {
        if (_textFieldIndices.canFind(index)) return;
        _textFieldIndices ~= index;
    }

    /* Called each time a new key is found, or once at the start of the program when no
     * keys are in use (entire column summarized). The new object is given the field
     * indices registered so far.
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        auto valuesLists = new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
        return valuesLists;
    }
}
1779 
/** Holds the field values saved for a single unique key: one FieldValues list per
 *  saved numeric field and one per saved text field. Lists are looked up by field
 *  index, and callers can ask for the values as stored, sorted, or (numeric only)
 *  for their median.
 */
class UniqueKeyValuesLists
{
    /* A FieldValues object holds the list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn results in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;   // NOTE(review): not referenced inside this class.

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved.
     * One FieldValues object is created per index, in the order given.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes one input line to every saved-field list, numeric lists first. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (fieldValues; _numericFieldValues) fieldValues.processNextLine(fields, missingPolicy);
        foreach (fieldValues; _textFieldValues) fieldValues.processNextLine(fields, missingPolicy);
    }

    /* Linear search for the numeric list of a field index. The index must have been
     * passed to the constructor; this is enforced by an assert only.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Text counterpart of findNumericFieldValues. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Numeric values saved for the field, in their current storage order. */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    /* Numeric values saved for the field, sorted. The stored data is sorted in place. */
    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    /* Text values saved for the field, in their current storage order. */
    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    /* Text values saved for the field, sorted. The stored data is sorted in place. */
    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    /* Median of the numeric values saved for the field. The result is cached by the
     * underlying FieldValues object until another value is added.
     */
    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* The list of values saved for one field, plus cached state (sorted flag, median). */
    private class FieldValues(ValueType)
    {
        import std.array : Appender;   // The Appender type is what this class uses.
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Appends a value and invalidates the cached median and sorted state. */
        private void addValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Saves this object's field from the line, subject to the missing-value policy:
         * a missing field is saved unchanged, replaced, or skipped. The text is converted
         * to ValueType before anything is stored, so a failed conversion leaves the list
         * unchanged.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                addValue(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                addValue(missingPolicy.missingReplacement.to!ValueType);
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The saved values in their current storage order. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The saved values, sorted. Sorting is done in place and skipped when the data
         * is already known to be sorted.
         */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* The median of the saved values, computed on first request and cached. */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                /* The median calculation works on the stored data in place and may
                 * leave it only partially ordered, so a previously established sort
                 * order can no longer be trusted. Worst case this costs one re-sort.
                 */
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}
1936 
/** SingleFieldOperator is the base class for operators that work on one field, the
 * most common kind of Operator. A derived class supplies makeCalculator and the
 * Calculator class that it returns.
 */
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        // Start with the default header. A custom header or an input header line
        // may replace it later.
        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Installs a caller-supplied header. After this, processHeaderLine leaves the
     * header alone. Only valid for operators that allow custom headers.
     */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    final string name() const @property
    {
        return _name;
    }

    final string header() const @property
    {
        return _header;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of this operator's field should be saved. These should be called
     * during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    /* Derives the output header from the input header line, unless a custom header
     * has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);
        _header = fields[_fieldIndex].to!string
            .summaryHeaderFromFieldHeader(_useHeaderSuffix ? _name : "");
    }

    abstract SingleFieldCalculator makeCalculator();
}
2040 
/** SingleFieldCalculator is the base class for the common case of a calculator that
 * works on one field. Derived classes implement processNextField() instead of
 * processNextLine(); the missing-value policy has already been applied by the time
 * processNextField() is called.
 */
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Picks this calculator's field out of the line and applies the operator's
     * missing-value policy: the field is forwarded as is, forwarded as the
     * replacement string, or dropped.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] value = fields[_fieldIndex];

        immutable bool forwardAsIs = policy.useMissing || !policy.isMissingField(value);
        if (forwardAsIs)
        {
            processNextField(value);
            return;
        }

        if (policy.replaceMissing) processNextField(policy.missingReplacement);
    }

    /* The operator this calculator belongs to. */
    abstract SingleFieldOperator getOperator();

    /* Receives the (possibly replaced) value of this calculator's field. */
    abstract void processNextField(const char[] field);
}
2079 
2080 /* Unittest helper functions. Only compiled when -unittest is in effect. */
2081 version(unittest)
2082 {
2083     /** A helper for SingleFieldOperator unit tests.
2084      *
2085      * testSingleFieldOperator takes a set of split file values, a field index, a header
2086      * suffix, and a set of expected values. The expected values array contains the
2087      * initial value (zero entries) and the expected values after each line. (One more
2088      * expected value than input lines.) The zero entry case is what is generated for an
2089      * empty file. An example testing the 'min' operator against a file with 2 columns,
2090      * 3 rows, using field index 1:
2091      *
2092      *    testSingleFieldOperator!MinOperator(
2093      *       [["10", "100"],               // The split file. 3 lines by 2 rows.
2094      *        ["5", "50"],
2095      *        ["20", "200"]],
2096      *       1,                            // Field index (zero-based, so "100", "50", "200")
2097      *       "min",                        // The header suffix, normally the operator name.
2098      *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
2099      *
2100      * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
2101      * Then run the operator is tested against each column, a total of six calls. Headers
2102      * are automatically checked. Additional entries can be used to extend coverage.
2103      *
2104      * A non-default MissingFieldPolicy can be provide as an optional last argument.
2105      * Operator tests should include exclusion and replacement variations. See operator
2106      * unit tests for details.
2107      *
2108      * The testSingleFieldOperatorBase adds an additional capability - Custom operator
2109      * init arguments. Currently this is used only by the quantile operator.
2110      *
2111      * These tests do not check unique key behavior (group-by). Operators don't have info
2112      * about unique keys, and interact with them only indirectly, via Calculators.
2113      */
2114     void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
2115         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2116          const char[][] expectedValues,
2117          MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
2118     {
2119         testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
2120     }
2121 
2122     void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
2123         (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
2124          const char[][] expectedValues,
2125          MissingFieldPolicy missingPolicy,
2126          T extraOpInitArgs)
2127     {
2128         import std.format : format;
2129         import std.array : appender;
2130         import std..string : chomp;
2131         import std.traits : EnumMembers;
2132 
2133         auto numFields = (splitFile[0]).length;
2134 
2135         assert(fieldIndex < numFields,
2136                format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
2137                       headerSuffix));
2138         assert(splitFile.length + 1 == expectedValues.length,
2139                format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
2140                       headerSuffix));
2141 
2142         /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */
2143         auto printOptions = SummarizerPrintOptions('#', '|');
2144 
2145         /* An input header line. */
2146         string[] inputHeaderLine = new string[numFields];
2147         foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;
2148 
2149         /* The different expected output field headers. */
2150         auto outputFieldHeaderWithNoHeaderLine =
2151             fieldHeaderFromIndex(fieldIndex)
2152             .summaryHeaderFromFieldHeader(headerSuffix);
2153         auto outputFieldHeaderFromHeaderLine =
2154             inputHeaderLine[fieldIndex]
2155             .summaryHeaderFromFieldHeader(headerSuffix);
2156         auto customOutputFieldHeader = "custom";
2157 
2158         enum HeaderUsecase {
2159             HeaderLine_DefaultHeader,
2160             HeaderLine_CustomHeader,
2161             NoHeaderLine_DefaultHeader,
2162             NoHeaderLine_CustomHeader,
2163             NoHeaderLine_NoOutputHeader,
2164         }
2165 
2166         string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
2167         {
2168             return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
2169                           op.name, hc, actual, expected);
2170         }
2171 
2172         string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
2173                                   const char[] actual, const char[] expected)
2174         {
2175             return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d, FieldIndex: %d\n    Actual: '%s';  Expected: '%s'",
2176                           op.name, hc, rowIndex, fieldIndex, actual, expected);
2177         }
2178 
2179         /* Run the logic for each header use case. */
2180         foreach (hc; EnumMembers!HeaderUsecase)
2181         {
2182             bool hasInputHeader = (
2183                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2184                 hc == HeaderUsecase.HeaderLine_CustomHeader
2185                 );
2186             bool hasOutputHeader = (
2187                 hc == HeaderUsecase.HeaderLine_DefaultHeader ||
2188                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2189                 hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
2190                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2191                 );
2192             bool hasCustomHeader = (
2193                 hc == HeaderUsecase.HeaderLine_CustomHeader ||
2194                 hc == HeaderUsecase.NoHeaderLine_CustomHeader
2195                 );
2196 
2197             if (hasCustomHeader) assert(hasOutputHeader);
2198 
2199             auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);
2200 
2201             if (hasCustomHeader)
2202             {
2203                 if (!op.allowCustomHeader) continue;   // Custom header not support by this operator
2204                 op.setCustomHeader(customOutputFieldHeader);
2205             }
2206 
2207             Operator[] operatorArray;
2208             operatorArray ~= op;
2209 
2210             auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
2211             summarizer.setOperators(inputRangeObject(operatorArray));
2212 
2213             if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);
2214 
2215             if (hasOutputHeader)
2216             {
2217                 /* Write the header line. Note that this is a one-field header, */
2218                 auto headerLineOutput = appender!(char[])();
2219                 summarizer.writeSummaryHeader(headerLineOutput, printOptions);
2220 
2221                 /* Test that the header was generated correctly.
2222                  *
2223                  * Note: Because the output is generated by a Summarizer, it will have a
2224                  * trailing newline. Use chomp to trim it.
2225                  */
2226                 final switch (hc)
2227                 {
2228                 case HeaderUsecase.HeaderLine_DefaultHeader:
2229                     assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
2230                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2231                                                outputFieldHeaderFromHeaderLine));
2232                     break;
2233                 case HeaderUsecase.NoHeaderLine_DefaultHeader:
2234                     assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
2235                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2236                                                outputFieldHeaderWithNoHeaderLine));
2237                     break;
2238                 case HeaderUsecase.HeaderLine_CustomHeader:
2239                 case HeaderUsecase.NoHeaderLine_CustomHeader:
2240                     assert(headerLineOutput.data.chomp == customOutputFieldHeader,
2241                            headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
2242                                                customOutputFieldHeader));
2243                     break;
2244                 case HeaderUsecase.NoHeaderLine_NoOutputHeader:
2245                     break;
2246                }
2247 
2248             }
2249 
2250             /* For each line, process the line, generate the output, and test that the
2251              * value is correct. Start with the empty file case.
2252              */
2253             foreach (i, const char[] expected; expectedValues)
2254             {
2255                 if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
2256                 auto summaryLineOutput = appender!(char[])();
2257                 summarizer.writeSummaryBody(summaryLineOutput, printOptions);
2258                 assert(summaryLineOutput.data.chomp == expected,
2259                        valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
2260                                           summaryLineOutput.data.chomp, expectedValues[i]));
2261             }
2262         }
2263     }
2264 }
2265 
/** ZeroFieldOperator is the base class for operators that do not read any input field.
 *
 * CountOperator, which counts the occurrences of each unique key, is the main use
 * case. Other uses are possible, for example, weighted random number assignment.
 *
 * The main reason for having ZeroFieldOperator and ZeroFieldCalculator is to make
 * explicit what information such an operator can rely on. The split fields handed to
 * processHeaderLine and processNextLine do not contain every field of the input line,
 * only the fields required by operators acting on specific fields. That is easy to
 * overlook when implementing an operator.
 *
 * The output header defaults to the operator name and can be replaced by calling
 * setCustomHeader. Derived classes supply makeCalculator.
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /** Creates the operator. The operator name is also used as the initial header. */
    this(string operatorName)
    {
        _header = _name = operatorName;
    }

    /** The operator name given at construction. */
    final string name() const @property
    {
        return _name;
    }

    /** The output header: the operator name unless a custom header was set. */
    final string header() const @property
    {
        return _header;
    }

    /** Zero field operators always accept a custom header. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /** Replaces the output header with customHeader. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* Deliberately empty. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Always empty. ZeroFieldOperators have no access to fields, so no numeric
     * field needs to be saved.
     */
    final size_t[] numericFieldsToSave()
    {
        return [];
    }

    /* Always empty. ZeroFieldOperators have no access to fields, so no text field
     * needs to be saved.
     */
    final size_t[] textFieldsToSave()
    {
        return [];
    }

    /** Creates the calculator holding this operator's state. */
    abstract ZeroFieldCalculator makeCalculator();
}
2328 
/** ZeroFieldCalculator is the base class for calculators that take no field as input,
 * the Count operator in particular. It is the companion of ZeroFieldOperator.
 *
 * The field-based entry points of the Calculator interface are implemented here as
 * final functions that drop the field data and forward to the two abstract functions
 * a derived class has to provide: processNextEntry(), in place of processNextLine(),
 * and the single argument form of calculate().
 */
class ZeroFieldCalculator : Calculator
{
    this() { }

    /** Called once for every input line processed. */
    abstract void processNextEntry();

    /** Produces the summary value as text, using only the print options. */
    abstract string calculate(const ref SummarizerPrintOptions printOptions);

    /* The fields are ignored; only the arrival of a new line is forwarded. */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        processNextEntry();
    }

    /* The saved values lists are ignored; forwards to the single argument form. */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return calculate(printOptions);
    }
}
2353 
version(unittest)
{
    /* A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the expected value
     * for the empty file followed by the expected value after each line. It therefore
     * has one more entry than splitFile has rows.
     *
     * The operator is run through a NoKeySummarizer once for each HeaderUsecase. Each
     * run checks the generated header line (when one is written) and then the summary
     * value after every input row. Custom header use cases are skipped for operators
     * that do not allow custom headers.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     *
     * Params:
     *   splitFile      = Input rows, each row already split into fields.
     *   defaultHeader  = Header expected when no custom header has been set.
     *   expectedValues = Expected summary value before any row and after each row.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* An empty file has no rows to take the field count from. Guard the index so
         * a bad call fails on the length assert below rather than on a range error.
         */
        size_t numFields = 0;
        if (splitFile.length > 0) numFields = splitFile[0].length;

        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. defaultHeader: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s;  Actual: '%s';  Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s;  RowIndex: %d\n    Actual: '%s';  Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            /* Every use case except the last one writes an output header. */
            bool hasOutputHeader = (hc != HeaderUsecase.NoHeaderLine_NoOutputHeader);
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                auto actualHeader = headerLineOutput.data.chomp;

                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(actualHeader == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, actualHeader, defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(actualHeader == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, actualHeader, customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    /* Not reached when hasOutputHeader is true; listed because a
                     * final switch must name every enum member.
                     */
                    break;
                }
            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case (index zero), where no
             * line has been processed yet.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                auto actualValue = summaryLineOutput.data.chomp;
                assert(actualValue == expected,
                       valueAssertMessage(operatorArray[0], hc, i, actualValue, expected));
            }
        }
    }
}
2494 
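/* Specific operators.
 *
 * Notes:
 * - Each operator declares its 'Calculator' as a nested class.
 * - A 'static' nested class (e.g. CountCalculator) keeps no reference to the context of
 *   the outer class, so its instances need to hold all needed state themselves.
 * - A non-static inner class (e.g. RetainCalculator) is tied to the operator instance
 *   that created it and reaches it through 'this.outer', which is what getOperator()
 *   returns. These calculators are also passed the field index they are summarizing.
 */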
2502 
/** CountOperator reports how many times each unique key occurs. When there is no
 * unique key it reports the number of input lines.
 *
 * Most operators summarize one specific field of the line. CountOperator instead
 * summarizes a property of the unique key itself, which is why it derives from
 * ZeroFieldOperator rather than SingleFieldOperator.
 */
class CountOperator : ZeroFieldOperator
{
    /* Holds the running count. Static: it needs nothing from the outer operator. */
    static class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count;

        /* One more entry seen. */
        final override void processNextEntry()
        {
            _count += 1;
        }

        /* The count so far, formatted via the print options. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }

    this()
    {
        super("count");
    }

    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator;
    }
}
2537 
unittest // CountOperator
{
    /* Files with one, two, and three fields per row. The same counts are expected
     * for each: zero for the empty file, then one more after every row.
     */
    auto inputFiles = [
        [["10"], ["9.5"], ["11"]],
        [["20", "-30"], ["21", "-29"], ["22", "-31"]],
        [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]],
    ];

    foreach (file; inputFiles)
    {
        testZeroFieldOperator!CountOperator(file, "count", ["0", "1", "2", "3"]);
    }
}
2548 
/** RetainOperator passes through the first value seen for a field and leaves the
 * field's header unchanged.
 *
 * It is meant for fields whose value is expected to be identical for every occurrence
 * of the unique key, where the goal is simply to carry the value into the output. It
 * behaves like FirstOperator, except that the original header is kept. That is set up
 * by the flags passed to the SingleFieldOperator constructor: no header suffix and no
 * custom header.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
 */
class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    /* Remembers the first field value it is given; later values are ignored. */
    class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (_done) return;    // A value has already been captured.

            /* Copy to a string; nextField is only a const view of the input. */
            _value = to!string(nextField);
            _done = true;
        }

        /* The captured value, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2601 
unittest // RetainOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* For every field of every file: empty output for the empty file, then the first
     * row's value after each of the three rows.
     */
    foreach (file; [oneColumn, twoColumns, threeColumns])
    {
        foreach (fieldIndex; 0 .. file[0].length)
        {
            auto retained = file[0][fieldIndex];
            testSingleFieldOperator!RetainOperator(file, fieldIndex, "", ["", retained, retained, retained]);
        }
    }

    /* First row has an empty (missing) value. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!RetainOperator(
        firstValueMissing, 0, "", ["", "", "r2c1", "r2c1"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!RetainOperator(
        firstValueMissing, 0, "", ["", "NA", "NA", "NA"], new MissingFieldPolicy(false, "NA"));
}
2621 
/** FirstOperator reports the first value encountered for the field. Values seen
 * after the first one are ignored.
 */
class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    /* Captures the first field value it is given. */
    class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (_done) return;    // Only the first value is of interest.

            /* Copy to a string; nextField is only a const view of the input. */
            _value = to!string(nextField);
            _done = true;
        }

        /* The captured value, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2666 
unittest // FirstOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* For every field of every file: empty output for the empty file, then the first
     * row's value after each of the three rows.
     */
    foreach (file; [oneColumn, twoColumns, threeColumns])
    {
        foreach (fieldIndex; 0 .. file[0].length)
        {
            auto firstValue = file[0][fieldIndex];
            testSingleFieldOperator!FirstOperator(file, fieldIndex, "first", ["", firstValue, firstValue, firstValue]);
        }
    }

    /* First row has an empty (missing) value. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!FirstOperator(
        firstValueMissing, 0, "first", ["", "", "r2c1", "r2c1"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!FirstOperator(
        firstValueMissing, 0, "first", ["", "NA", "NA", "NA"], new MissingFieldPolicy(false, "NA"));
}
2686 
/** LastOperator reports the most recent value encountered for the field. Each new
 * value replaces the one held before.
 */
class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    /* Holds the latest field value it was given. */
    class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Copy to a string; nextField is only a const view of the input. */
            _value = to!string(nextField);
        }

        /* The latest value, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}
2726 
unittest // LastOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    /* For every field of every file: empty output for the empty file, then after each
     * row the value that row holds in the field.
     */
    foreach (file; [oneColumn, twoColumns, threeColumns])
    {
        foreach (fieldIndex; 0 .. file[0].length)
        {
            string[] expected = [""];
            foreach (row; file) expected ~= row[fieldIndex];
            testSingleFieldOperator!LastOperator(file, fieldIndex, "last", expected);
        }
    }

    /* First row has an empty (missing) value. */
    auto firstValueMissing = [[""], ["r2c1"], ["r3c1"]];

    // Exclude missing
    testSingleFieldOperator!LastOperator(
        firstValueMissing, 0, "last", ["", "", "r2c1", "r3c1"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!LastOperator(
        firstValueMissing, 0, "last", ["", "NA", "r2c1", "r3c1"], new MissingFieldPolicy(false, "NA"));
}
2746 
/** MinOperator reports the smallest value encountered for the field. This is a
 * numeric operator: every field value is converted to a double.
 */
class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    /* Tracks the running minimum. The value is NaN until a first value arrives. */
    class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            /* The first value is always taken; later ones only when smaller. */
            if (_isFirst || candidate < _value)
            {
                _value = candidate;
                _isFirst = false;
            }
        }

        /* The minimum so far, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}
2796 
unittest // MinOperator
{
    /* One test run: the input file, the field to summarize, and the expected output
     * for the empty file followed by the expected output after each row.
     */
    static struct Case
    {
        string[][] file;
        size_t fieldIndex;
        string[] expected;
    }

    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "10", "9.5", "9.5"]),
        Case(twoColumns, 0, ["nan", "20", "20", "20"]),
        Case(twoColumns, 1, ["nan", "-30", "-30", "-31"]),
        Case(threeColumns, 0, ["nan", "9009", "199", "199"]),
        Case(threeColumns, 1, ["nan", "9", "0", "0"]),
        Case(threeColumns, 2, ["nan", "-4.5", "-4.5", "-4.5"]),
    ];

    foreach (testCase; cases)
    {
        testSingleFieldOperator!MinOperator(testCase.file, testCase.fieldIndex, "min", testCase.expected);
    }

    /* First row has an empty (missing) value. */
    auto firstValueMissing = [[""], ["10"], ["-10"]];

    // Exclude missing
    testSingleFieldOperator!MinOperator(
        firstValueMissing, 0, "min", ["nan", "nan", "10", "-10"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MinOperator(
        firstValueMissing, 0, "min", ["nan", "5", "5", "-10"], new MissingFieldPolicy(false, "5"));
}
2816 
/** MaxOperator reports the largest value encountered for the field. This is a
 * numeric operator: every field value is converted to a double.
 */
class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /* Tracks the running maximum. The value is NaN until a first value arrives. */
    class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            /* The first value is always taken; later ones only when larger. */
            if (_isFirst || candidate > _value)
            {
                _value = candidate;
                _isFirst = false;
            }
        }

        /* The maximum so far, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}
2866 
unittest // MaxOperator
{
    /* One test run: the input file, the field to summarize, and the expected output
     * for the empty file followed by the expected output after each row.
     */
    static struct Case
    {
        string[][] file;
        size_t fieldIndex;
        string[] expected;
    }

    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["nan", "10", "10", "11"]),
        Case(twoColumns, 0, ["nan", "20", "21", "22"]),
        Case(twoColumns, 1, ["nan", "-30", "-29", "-29"]),
        Case(threeColumns, 0, ["nan", "9009", "9009", "9009"]),
        Case(threeColumns, 1, ["nan", "9", "9", "9"]),
        Case(threeColumns, 2, ["nan", "-4.5", "-0.5", "12"]),
    ];

    foreach (testCase; cases)
    {
        testSingleFieldOperator!MaxOperator(testCase.file, testCase.fieldIndex, "max", testCase.expected);
    }

    /* First row has an empty (missing) value. */
    auto firstValueMissing = [[""], ["-10"], ["10"]];

    // Exclude missing
    testSingleFieldOperator!MaxOperator(
        firstValueMissing, 0, "max", ["nan", "nan", "-10", "10"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!MaxOperator(
        firstValueMissing, 0, "max", ["nan", "5", "5", "10"], new MissingFieldPolicy(false, "5"));
}
2886 
/** RangeOperator reports the difference between the largest and the smallest value
 * encountered for the field.
 *
 * The range is zero when there is a single value or when all values are equal. It is
 * also zero before any value has been seen, as both bounds start at zero. This is a
 * numeric operator: every field value is converted to a double.
 */
class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /* Tracks the running minimum and maximum. */
    class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            if (_isFirst)
            {
                /* The first value is both the minimum and the maximum. */
                _isFirst = false;
                _minValue = candidate;
                _maxValue = candidate;
                return;
            }

            /* A later value can move at most one of the two bounds. */
            if (candidate > _maxValue) _maxValue = candidate;
            else if (candidate < _minValue) _minValue = candidate;
        }

        /* Maximum minus minimum, formatted via the print options. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
2944 
unittest // RangeOperator
{
    /* One test run: the input file, the field to summarize, and the expected output
     * for the empty file followed by the expected output after each row.
     */
    static struct Case
    {
        string[][] file;
        size_t fieldIndex;
        string[] expected;
    }

    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    auto cases = [
        Case(oneColumn, 0, ["0", "0", "0.5", "1.5"]),
        Case(twoColumns, 0, ["0", "0", "1", "2"]),
        Case(twoColumns, 1, ["0", "0", "1", "2"]),
        Case(threeColumns, 0, ["0", "0", "8810", "8810"]),
        Case(threeColumns, 1, ["0", "0", "9", "9"]),
        Case(threeColumns, 2, ["0", "0", "4", "16.5"]),
    ];

    foreach (testCase; cases)
    {
        testSingleFieldOperator!RangeOperator(testCase.file, testCase.fieldIndex, "range", testCase.expected);
    }

    /* The first and third rows have an empty (missing) value. */
    auto someValuesMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    // Exclude missing
    testSingleFieldOperator!RangeOperator(
        someValuesMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], new MissingFieldPolicy(true, ""));

    // Replace missing
    testSingleFieldOperator!RangeOperator(
        someValuesMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], new MissingFieldPolicy(false, "5.5"));
}
2964 
/** SumOperator adds up all the values of a field. This is a numeric operator; every
 * field is converted to a double. The sum starts at 0.0.
 */
class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Accumulates the running total of the values it is fed. */
    class SumCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            double value = nextField.to!double;
            _runningSum = _runningSum + value;
        }

        /* Formats the accumulated total. The saved value lists are not used. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_runningSum);
        }
    }
}
3004 
unittest // SumOperator
{
    /* Inputs with one, two, and three fields per line. The expected lists read as
     * running sums, starting with "0" ahead of the first line's result.
     */
    auto oneFieldFile = [["10"], ["9.5"], ["11"]];
    auto twoFieldFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeFieldFile = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(oneFieldFile, 0, "sum", ["0", "10", "19.5", "30.5"]);

    testSingleFieldOperator!SumOperator(twoFieldFile, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoFieldFile, 1, "sum", ["0", "-30", "-59", "-90"]);

    testSingleFieldOperator!SumOperator(threeFieldFile, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeFieldFile, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeFieldFile, 2, "sum", ["0", "-4.5", "-5", "7"]);

    /* Input containing empty fields, run under both missing-field policies. */
    auto missingFile = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!SumOperator(
        missingFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "1.5");  // Replace missing
    testSingleFieldOperator!SumOperator(
        missingFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], replacePolicy);
}
3024 
/** MeanOperator computes the arithmetic mean (average) of a field's values. This is
 * a numeric operator; every field is converted to a double.
 *
 * When no values have been processed the result is NaN.
 */
class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Keeps a running total and a count of the values it is fed. */
    class MeanCalculator : SingleFieldCalculator
    {
        private double _runningSum = 0.0;
        private size_t _numValues = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /* Converts the field to a double, adds it to the total, and counts it. */
        final override void processNextField(const char[] nextField)
        {
            double value = nextField.to!double;
            _runningSum += value;
            ++_numValues;
        }

        /* Formats total / count, or NaN when nothing was counted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_numValues == 0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_runningSum / _numValues.to!double);
        }
    }
}
3067 
unittest // MeanOperator
{
    /* Inputs with one, two, and three fields per line. Each expected list starts
     * with "nan", ahead of the first line's result.
     */
    auto oneFieldFile = [["10"], ["9.5"], ["7.5"]];
    auto twoFieldFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeFieldFile = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(oneFieldFile, 0, "mean", ["nan", "10", "9.75", "9"]);

    testSingleFieldOperator!MeanOperator(twoFieldFile, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoFieldFile, 1, "mean", ["nan", "-30", "-29.5", "-30"]);

    testSingleFieldOperator!MeanOperator(threeFieldFile, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeFieldFile, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeFieldFile, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    /* Input containing empty fields, run under both missing-field policies. */
    auto missingFile = [[""], ["6"], [""], ["14"], ["40"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!MeanOperator(
        missingFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MeanOperator(
        missingFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"], replacePolicy);
}
3087 
/** MedianOperator reports the median of a field's values. This is a numeric operator.
 *
 * The calculator keeps no state of its own. Every field value is held in memory by
 * the unique key value lists, and the median is read from them when the result is
 * calculated.
 */
class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Stateless calculator; the median comes from the saved numeric values. */
    class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the values are saved elsewhere, nothing to do per field. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Formats the median of the values saved for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }
}
3128 
unittest // MedianOperator
{
    /* Inputs with one, two, and three fields per line. Each expected list starts
     * with "nan", ahead of the first line's result.
     */
    auto oneFieldFile = [["10"], ["9.5"], ["7.5"]];
    auto twoFieldFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeFieldFile = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(oneFieldFile, 0, "median", ["nan", "10", "9.75", "9.5"]);

    testSingleFieldOperator!MedianOperator(twoFieldFile, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoFieldFile, 1, "median", ["nan", "-30", "-29.5", "-30"]);

    testSingleFieldOperator!MedianOperator(threeFieldFile, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeFieldFile, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeFieldFile, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    /* Input containing empty fields, run under both missing-field policies. */
    auto missingFile = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!MedianOperator(
        missingFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MedianOperator(
        missingFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], replacePolicy);
}
3148 
/** QuantileOperator reports the value found at a given cumulative probability of a
 * field's values. This is a numeric operator.
 *
 * For example, quantiles are often produced for the 0.25, 0.5, and 0.75
 * probabilities (equivalently the 25th, 50th, and 75th percentile ranks, the 50th
 * being the median). The data is sorted in ascending order. One operator handles a
 * single probability, but it is common to request several quantiles of the same
 * field when summarizing.
 *
 * The output header is "pct" followed by the probability expressed as a percentage
 * (e.g. "pct50"); a probability of exactly 0.0 yields "pct0".
 *
 * All of the field's values are held in memory for this calculation. This is
 * handled by the unique key value lists.
 */
class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /* The probability must lie in [0.0, 1.0]; this is checked with an assert only. */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(probability >= 0.0 && probability <= 1.0);

        /* Zero is special-cased so the header is "pct0" rather than a "%02g" rendering. */
        string header = (probability != 0.0)
            ? format("pct%02g", probability * 100.0)
            : "pct0";

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Stateless calculator; the quantile comes from the saved, sorted numeric values. */
    class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the values are saved elsewhere, nothing to do per field. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Formats the quantile of the sorted saved values at the operator's probability. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_numerics : quantile;

            auto sortedValues = valuesLists.numericValuesSorted(fieldIndex);
            return printOptions.formatNumber(quantile(this.outer._prob, sortedValues));
        }
    }
}
3204 
unittest // QuantileOperator
{
    /* Inputs with one, two, and three fields per line. */
    auto oneFieldFile = [["10"], ["9.5"], ["7.5"]];
    auto twoFieldFile = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeFieldFile = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultPolicy = new MissingFieldPolicy;

    /* Probability 0.5: expectations are the same as in the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(oneFieldFile, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 0, "pct50", ["nan", "20", "20.5", "21"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 1, "pct50", ["nan", "9", "4.5", "0"], defaultPolicy, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultPolicy, 0.50);

    /* Probability 0.0: the minimum value seen so far. */
    testSingleFieldOperatorBase!QuantileOperator(oneFieldFile, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 0, "pct0", ["nan", "20", "20", "20"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 0, "pct0", ["nan", "9009", "9", "9"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 1, "pct0", ["nan", "9", "0", "-3"], defaultPolicy, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultPolicy, 0.0);

    /* Probability 1.0: the maximum value seen so far. */
    testSingleFieldOperatorBase!QuantileOperator(oneFieldFile, 0, "pct100", ["nan", "10", "10", "10"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 0, "pct100", ["nan", "20", "21", "22"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoFieldFile, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 1, "pct100", ["nan", "9", "9", "9"], defaultPolicy, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeFieldFile, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultPolicy, 1.0);

    /* Missing-field policies: same input and expectations as the median tests. */
    auto missingFile = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(
        missingFile, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], excludePolicy, 0.5);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperatorBase!QuantileOperator(
        missingFile, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], replacePolicy, 0.5);
}
3243 
/** MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value, without a normalization applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /** Stateless calculator; the MAD is derived from the saved numeric values. */
    class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* Computes |value - median| for every saved value and formats the median of
         * those deviations.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_numerics : rangeMedian;

            auto median = valuesLists.numericValuesMedian(fieldIndex);
            auto values = valuesLists.numericValues(fieldIndex);
            auto medianDevs = new double[values.length];

            /* The index is size_t rather than int: narrowing an array index to int is
             * deprecated and would be wrong for arrays longer than int.max.
             */
            foreach (size_t i, double v; values)
                medianDevs[i] = abs(v - median);

            return printOptions.formatNumber(medianDevs.rangeMedian);
        }
    }
}
3296 
unittest // MadOperator
{
    /* Inputs with one, two, and three fields per line. Each expected list starts
     * with "nan", ahead of the first line's result.
     */
    auto oneFieldFile = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoFieldFile = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeFieldFile = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(oneFieldFile, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);

    testSingleFieldOperator!MadOperator(twoFieldFile, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(twoFieldFile, 1, "mad", ["nan", "0", "0.5", "1"]);

    testSingleFieldOperator!MadOperator(threeFieldFile, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(threeFieldFile, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(threeFieldFile, 2, "mad", ["nan", "0", "1", "2"]);

    /* Input containing empty fields, run under both missing-field policies. */
    auto missingFile = [[""], ["16"], [""], ["32"], ["-4"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!MadOperator(
        missingFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "0");  // Replace missing
    testSingleFieldOperator!MadOperator(
        missingFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"], replacePolicy);
}
3316 
/** VarianceOperator computes the variance of a field's values. This is a numeric
 * operator; every field is converted to a double.
 *
 * The variance is accumulated incrementally, one value at a time, and the final
 * sum of squared differences is divided by (count - 1). With fewer than two values
 * the result is NaN.
 */
class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /** Maintains a running count, mean, and sum of squared differences. */
    class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double value = nextField.to!double;
            double diffFromOldMean = value - _mean;
            _mean += diffFromOldMean / _count;

            /* Uses the difference from the old mean times the difference from the new mean. */
            double diffFromNewMean = value - _mean;
            _m2 += diffFromOldMean * diffFromNewMean;
        }

        /* Formats m2 / (count - 1), or NaN when fewer than two values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_count < 2.0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_m2 / (_count - 1.0));
        }
    }
}
3363 
unittest // VarianceOperator
{
    /* Inputs with one, two, and three fields per line. Each expected list starts
     * with two "nan" entries before the first numeric result.
     */
    auto oneFieldFile = [["5"], ["10"], ["15"]];
    auto twoFieldFile = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeFieldFile = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(oneFieldFile, 0, "var", ["nan", "nan", "12.5", "25"]);

    testSingleFieldOperator!VarianceOperator(twoFieldFile, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(twoFieldFile, 1, "var", ["nan", "nan", "12.5", "25"]);

    testSingleFieldOperator!VarianceOperator(threeFieldFile, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeFieldFile, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(threeFieldFile, 2, "var", ["nan", "nan", "0", "3"]);

    /* Input ending in an empty field, run under both missing-field policies. */
    auto missingFile = [["5"], ["10"], [""]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!VarianceOperator(
        missingFile, 0, "var", ["nan", "nan", "12.5", "12.5"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "15");  // Replace missing
    testSingleFieldOperator!VarianceOperator(
        missingFile, 0, "var", ["nan", "nan", "12.5", "25"], replacePolicy);
}
3383 
/** StDevOperator computes the standard deviation of a field's values. This is a
 * numeric operator; every field is converted to a double.
 *
 * The accumulation is the same incremental scheme VarianceOperator uses; the result
 * is the square root of m2 / (count - 1). With fewer than two values the result is
 * NaN.
 */
class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /** Maintains a running count, mean, and sum of squared differences. */
    class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running count, mean, and squared-difference sum. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double value = nextField.to!double;
            double diffFromOldMean = value - _mean;
            _mean += diffFromOldMean / _count;

            /* Uses the difference from the old mean times the difference from the new mean. */
            double diffFromNewMean = value - _mean;
            _m2 += diffFromOldMean * diffFromNewMean;
        }

        /* Formats sqrt(m2 / (count - 1)), or NaN when fewer than two values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_count < 2.0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(sqrt(_m2 / (_count - 1.0)));
        }
    }
}
3431 
/* StDevOperator unit tests. Expected values are compared as exact formatted text;
 * a tolerance option would make these tests better.
 */
unittest
{
    /* Inputs with one, two, and three fields per line. Each expected list starts
     * with two "nan" entries before the first numeric result.
     */
    auto oneFieldFile = [["1"], ["4"], ["7"]];
    auto twoFieldFile = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeFieldFile = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    testSingleFieldOperator!StDevOperator(oneFieldFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);

    testSingleFieldOperator!StDevOperator(twoFieldFile, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(twoFieldFile, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);

    testSingleFieldOperator!StDevOperator(threeFieldFile, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(threeFieldFile, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(threeFieldFile, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* Input ending in an empty field, run under both missing-field policies. */
    auto missingFile = [["1"], ["4"], [""]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!StDevOperator(
        missingFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "7");  // Replace missing
    testSingleFieldOperator!StDevOperator(
        missingFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], replacePolicy);
}
3453 
/** UniqueCountOperator reports how many distinct values a field has. Values are
 * compared as exact text, not numerically (so "1.0" and "1" are different values).
 *
 * Every distinct field value is kept in memory as part of this calculation.
 */
class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    /** Collects the distinct field values in an associative array used as a set. */
    class UniqueCountCalculator : SingleFieldCalculator
    {
        private bool[string] _seenValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field text if it has not been seen yet. */
        final override void processNextField(const char[] nextField)
        {
            /* Known value: return before making a string copy of the field. */
            if (nextField in _seenValues) return;

            _seenValues[nextField.to!string] = true;
        }

        /* Formats the number of distinct values recorded. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_seenValues.length);
        }
    }
}
3496 
unittest // UniqueCount
{
    /* Inputs with one, two, and three fields per line. The three-field input mixes
     * "1", "1.0", "2", and "2.0", which are counted as different text values.
     */
    auto oneFieldFile = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto twoFieldFile = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeFieldFile = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(
        oneFieldFile, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);

    testSingleFieldOperator!UniqueCountOperator(twoFieldFile, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(twoFieldFile, 1, "unique_count", ["0", "1", "1", "2"]);

    testSingleFieldOperator!UniqueCountOperator(threeFieldFile, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(threeFieldFile, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(threeFieldFile, 2, "unique_count", ["0", "1", "2", "3"]);

    /* Input containing empty fields, run under both missing-field policies. */
    auto missingFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];

    auto excludePolicy = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!UniqueCountOperator(
        missingFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        excludePolicy);

    auto replacePolicy = new MissingFieldPolicy(false, "XYZ");  // Replace missing
    testSingleFieldOperator!UniqueCountOperator(
        missingFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        replacePolicy);
}
3518 
/** MissingCountOperator reports how many of a field's values are missing.
 *
 * This operator overrides the global missing-field policy: the caller's policy is
 * kept aside and used only to decide whether a field counts as missing, while the
 * base class is given a separate `MissingFieldPolicy(false, "")` (presumably so that
 * missing fields still reach the calculator - confirm against MissingFieldPolicy).
 */
class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    /** Counts the fields that the caller's policy classifies as missing. */
    class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numMissing = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Bumps the counter when the caller's policy says the field is missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;

            if (policy.isMissingField(nextField))
            {
                ++_numMissing;
            }
        }

        /* Formats the number of missing fields counted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_numMissing);
        }
    }
}
3562 
unittest // MissingCount
{
    /* Inputs with one, two, and three fields per line. Only empty fields are expected
     * to count as missing; the single-space field in the first input does not.
     */
    auto oneFieldFile = [["a"], ["b"], [""], [" "], [""]];
    auto twoFieldFile = [["abc", ""], ["", ""], ["def", ""]];
    auto threeFieldFile = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    /* Default missing-field policy. */
    testSingleFieldOperator!MissingCountOperator(oneFieldFile, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(twoFieldFile, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(twoFieldFile, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(threeFieldFile, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(threeFieldFile, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(threeFieldFile, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludePolicy = new MissingFieldPolicy(true, "");
    auto replacePolicy = new MissingFieldPolicy(false, "X");

    /* The same expectations hold under the exclude policy and then the replace policy. */
    foreach (policy; [excludePolicy, replacePolicy])
    {
        testSingleFieldOperator!MissingCountOperator(oneFieldFile, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(twoFieldFile, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(twoFieldFile, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(threeFieldFile, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(threeFieldFile, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(threeFieldFile, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}
3593 
/** NotMissingCountOperator reports how many of a field's values are not missing.
 *
 * This operator overrides the global missing-field policy: the caller's policy is
 * kept aside and used only to decide whether a field counts as missing, while the
 * base class is given a separate `MissingFieldPolicy(false, "")` (presumably so that
 * missing fields still reach the calculator - confirm against MissingFieldPolicy).
 */
class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /* Creates a new calculator bound to this operator's field index. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    /** Counts the fields that the caller's policy does not classify as missing. */
    class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _numPresent = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Bumps the counter unless the caller's policy says the field is missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;

            if (policy.isMissingField(nextField)) return;

            ++_numPresent;
        }

        /* Formats the number of non-missing fields counted. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_numPresent);
        }
    }
}
3637 
unittest // NotMissingCount
{
    /* One table entry: the input rows, the field to summarize, and the expected
     * values handed to the test helper.
     */
    static struct TestCase
    {
        string[][] rows;
        size_t fieldIndex;
        string[] expected;
    }

    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    auto testCases = [
        TestCase(col1File, 0, ["0", "1", "2", "2", "3", "3"]),
        TestCase(col2File, 0, ["0", "1", "1", "2"]),
        TestCase(col2File, 1, ["0", "0", "0", "0"]),
        TestCase(col3File, 0, ["0", "0", "1", "2"]),
        TestCase(col3File, 1, ["0", "1", "1", "1"]),
        TestCase(col3File, 2, ["0", "1", "2", "2"]),
    ];

    /* First pass: call the helper without a missing-field policy argument.
     * Each call gets its own copy of the expected values.
     */
    foreach (tc; testCases)
    {
        testSingleFieldOperator!NotMissingCountOperator(
            tc.rows, tc.fieldIndex, "not_missing_count", tc.expected.dup);
    }

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected values are the same under either explicit policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        foreach (tc; testCases)
        {
            testSingleFieldOperator!NotMissingCountOperator(
                tc.rows, tc.fieldIndex, "not_missing_count", tc.expected.dup, policy);
        }
    }
}
3668 
/** ModeOperator outputs the value that occurs most often. When several values share
 * the highest count, the one that was seen first is produced.
 *
 * Each distinct field value is held in memory, together with its count, as part of
 * this calculation.
 */
class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    /** Calculator tracking a count per distinct value plus the first-seen order. */
    class ModeCalculator : SingleFieldCalculator
    {
        /* Number of occurrences of each distinct value. */
        private size_t[string] _valueCounts;
        /* Distinct values in the order they were first seen. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
            }
            else
            {
                /* First occurrence. The hash key and the ordered list both need an
                 * immutable copy, as nextField is only const.
                 */
                string firstSeen = nextField.idup;
                _uniqueValues.put(firstSeen);
                _valueCounts[firstSeen] = 1;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string winner = "";
            size_t winnerCount = 0;

            /* Candidates are visited in first-seen order. The comparison is strict,
             * so a later value with an equal count never displaces an earlier one.
             */
            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                immutable candidateCount = _valueCounts[candidate];
                if (candidateCount <= winnerCount) continue;

                winner = candidate;
                winnerCount = candidateCount;
            }

            return winner;
        }
    }
}
3740 
unittest // ModeOperator
{
    /* One table entry: the input rows, the field to summarize, and the expected
     * values handed to the test helper.
     */
    static struct TestCase
    {
        string[][] rows;
        size_t fieldIndex;
        string[] expected;
    }

    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Cases run without a missing-field policy argument. */
    auto testCases = [
        TestCase(col1File, 0, ["", "a", "a", "a", "c", "b", "b", "b"]),
        TestCase(col2File, 0, ["", "abc", "abc", "def"]),
        TestCase(col2File, 1, ["", "pqr", "pqr", "pqr"]),
        TestCase(col3File, 0, ["", "1.0", "1.0", "1.0"]),
        TestCase(col3File, 1, ["", "1", "1", "a"]),
        TestCase(col3File, 2, ["", "a", "a", "a"]),
    ];

    foreach (tc; testCases)
    {
        testSingleFieldOperator!ModeOperator(tc.rows, tc.fieldIndex, "mode", tc.expected);
    }

    /* A column containing empty fields, run under the two explicit policies. */
    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ModeOperator(
        col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ModeOperator(
        col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], replaceMissing);
}
3762 
/** ModeCountOperator outputs how many times the most frequent value was seen.
 *
 * Each distinct field value is held in memory, together with its count, as part of
 * this calculation.
 */
class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    /** Calculator tracking the number of occurrences of each distinct value. */
    class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
                return;
            }

            /* First occurrence. The hash key needs an immutable copy, as nextField
             * is only const.
             */
            string firstSeen = nextField.idup;
            _valueCounts[firstSeen] = 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* Largest count over all distinct values; zero when nothing was seen. */
            size_t largest = 0;
            foreach (occurrences; _valueCounts.byValue)
            {
                if (largest < occurrences) largest = occurrences;
            }

            return printOptions.formatNumber(largest);
        }
    }
}
3817 
unittest // ModeCountOperator
{
    /* One table entry: the input rows, the field to summarize, and the expected
     * values handed to the test helper.
     */
    static struct TestCase
    {
        string[][] rows;
        size_t fieldIndex;
        string[] expected;
    }

    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    /* Cases run without a missing-field policy argument. */
    auto testCases = [
        TestCase(col1File, 0, ["0", "1", "1", "1", "2", "2", "3", "3"]),
        TestCase(col2File, 0, ["0", "1", "1", "2"]),
        TestCase(col2File, 1, ["0", "1", "2", "2"]),
        TestCase(col3File, 0, ["0", "1", "1", "1"]),
        TestCase(col3File, 1, ["0", "1", "1", "2"]),
        TestCase(col3File, 2, ["0", "1", "1", "1"]),
    ];

    foreach (tc; testCases)
    {
        testSingleFieldOperator!ModeCountOperator(tc.rows, tc.fieldIndex, "mode_count", tc.expected);
    }

    /* A column containing empty fields, run under the two explicit policies. */
    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ModeCountOperator(
        col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ModeCountOperator(
        col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], replaceMissing);
}
3839 
/** ValuesOperator outputs every value, joined by the values delimiter taken from
 * the print options.
 *
 * All the field values are stored in memory as part of this calculation. The
 * storage is handled by the unique key values lists; the operator requests it by
 * calling setSaveFieldValuesText in its constructor.
 */
class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    /** Calculator with no state of its own; it reads the saved text values. */
    class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the values are saved in the values lists. */
        final override void processNextField(const char[] nextField)
        { }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return join(savedValues, printOptions.valuesDelimiter);
        }
    }
}
3881 
unittest // ValuesOperator
{
    /* One table entry: the input rows, the field to summarize, and the expected
     * values handed to the test helper.
     */
    static struct TestCase
    {
        string[][] rows;
        size_t fieldIndex;
        string[] expected;
    }

    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    /* Cases run without a missing-field policy argument. */
    auto testCases = [
        TestCase(col1File, 0, ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]),
        TestCase(col2File, 0, ["", "", "|", "||xyz"]),
        TestCase(col2File, 1, ["", "50", "50|51", "50|51|52"]),
        TestCase(col3File, 0, ["", "z", "z|y", "z|y|w"]),
        TestCase(col3File, 1, ["", "a", "a|ab", "a|ab|ba"]),
        TestCase(col3File, 2, ["", "-", "-|--", "-|--|---"]),
    ];

    foreach (tc; testCases)
    {
        testSingleFieldOperator!ValuesOperator(tc.rows, tc.fieldIndex, "values", tc.expected);
    }

    /* The column containing empty fields, run under the two explicit policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!ValuesOperator(
        col1File, 0, "values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!ValuesOperator(
        col1File, 0, "values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
        replaceMissing);
}
3902 
/** UniqueValuesOperator outputs each distinct value once, joined by the values
 * delimiter taken from the print options. Values appear in the order they were
 * first seen.
 *
 * All unique field values are stored in memory as part of this calculation.
 */
class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    /** Calculator recording each distinct value the first time it is seen. */
    class UniqueValuesCalculator : SingleFieldCalculator
    {
        /* Membership test for values already recorded; the stored number is unused. */
        private size_t[string] _valuesHash;
        /* Distinct values in the order they were first seen. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Already recorded: nothing to add. */
            if (nextField in _valuesHash) return;

            /* The hash key and the ordered list both need an immutable copy, as
             * nextField is only const.
             */
            string firstSeen = nextField.idup;
            _uniqueValues.put(firstSeen);
            _valuesHash[firstSeen] = 1;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctValues = _uniqueValues.data;
            return join(distinctValues, printOptions.valuesDelimiter);
        }
    }
}
3954 
unittest // UniqueValuesOperator
{
    /* One table entry: the input rows, the field to summarize, and the expected
     * values handed to the test helper.
     */
    static struct TestCase
    {
        string[][] rows;
        size_t fieldIndex;
        string[] expected;
    }

    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    /* Cases run without a missing-field policy argument. */
    auto testCases = [
        TestCase(col1File, 0, ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]),
        TestCase(col2File, 0, ["", "", "", "|xyz"]),
        TestCase(col2File, 1, ["", "50", "50", "50|52"]),
        TestCase(col3File, 0, ["", "z", "z|y", "z|y|w"]),
        TestCase(col3File, 1, ["", "a", "a|ab", "a|ab|ba"]),
        TestCase(col3File, 2, ["", "-", "-|--", "-|--"]),
    ];

    foreach (tc; testCases)
    {
        testSingleFieldOperator!UniqueValuesOperator(tc.rows, tc.fieldIndex, "unique_values", tc.expected);
    }

    /* The column containing empty fields, run under the two explicit policies. */
    auto excludeMissing = new MissingFieldPolicy(true, "");  // Exclude missing
    testSingleFieldOperator!UniqueValuesOperator(
        col1File, 0, "unique_values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "X");  // Replace missing
    testSingleFieldOperator!UniqueValuesOperator(
        col1File, 0, "unique_values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
        replaceMissing);
}