/**
Command line tool that reads TSV files and summarizes field values associated with
equivalent keys.

Copyright (c) 2016-2019, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_summarize;

import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
import std.array : join;
import std.conv : to;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple;
import std.container : DList;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then runs tsvSummarize on the remaining arguments
     * (the input files). Returns the process exit code: the code chosen by
     * processArgs when it asks to stop, 1 if tsvSummarize throws, 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try tsvSummarize(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Text printed ahead of the option list by '--help-verbose'. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  make    color   time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make are generated by the command:

  $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

  make   time_min  time_mean
  ford   122       127
  chevy  124       124
  bmw    118       122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for the full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25              // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Numeric values are printed to 12 significant digits by default. This can be
changed using the '--p|float-precision' option. If six or less it sets the
number of significant digits after the decimal point. If greater than six it
sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

/* Text printed ahead of the option list by '--help'. */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";

/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                 // -g, --group-by
    bool hasHeader = false;             // -H, --header
    bool writeHeader = false;           // -w, --write-header
    char inputFieldDelimiter = '\t';    // --d|delimiter
    char valuesDelimiter = '|';         // --v|values-delimiter
    size_t floatPrecision = 12;         // --p|float-precision
    bool excludeMissing = false;        // --x|exclude-missing
    string missingValueReplacement;     // --r|replace-missing
    bool helpVerbose = false;           // --help-verbose
    bool versionWanted = false;         // --V|version
    DList!Operator operators;           // Operators, in the order specified.
    size_t endFieldIndex = 0;           // Derived value. Max field index used plus one.
    /* NOTE(review): a class instance used as a struct field initializer is built once at
     * compile time, so all TsvSummarizeOptions instances presumably share this object.
     * derivations() re-applies the policy on every processArgs call - confirm the sharing
     * is intended.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.

    /* Returns a tuple.
First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by", "<field-list> Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header", " Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count", " Count occurrences of each unique key.", &countOptionHandler,
                "count-header", "STR Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler,
                "retain", "<field-list> Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first", "<field-list>[:STR] First value seen.", &operatorOptionHandler!FirstOperator,
                "last", "<field-list>[:STR] Last value seen.", &operatorOptionHandler!LastOperator,
                "min", "<field-list>[:STR] Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max", "<field-list>[:STR] Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range", "<field-list>[:STR] Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum", "<field-list>[:STR] Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean", "<field-list>[:STR] Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median", "<field-list>[:STR] Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value is '<field-list>' or '<field>:<header>'. One operator is appended
     * to 'operators' per field, and 'endFieldIndex' is raised to cover each field used.
     * Throws an Exception on a malformed value, on a custom header combined with multiple
     * fields, or when the operator does not allow custom headers.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : parseFieldList;

        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a trailing ':' with no header after it. */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
        {
            auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

            if (!valSplit[2].empty)  // Header specified
            {
                if (fieldNum > 1)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                               option, optionVal));
                }
                else if (!op.allowCustomHeader)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                               option, optionVal));
                }

                op.setCustomHeader(valSplit[2].to!string);
            }

            operators.insertBack(op);
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then rethrow the same exception. */
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>]' or '<field>:<prob>:<header>'.
     * One QuantileOperator is appended per (field, probability) pair. Throws an Exception
     * on a malformed value, a probability outside [0.0,1.0], or a custom header combined
     * with multiple fields or probabilities.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            fieldIndices ~= fieldIndex;
        }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated conjunction so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator with header STR. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}

/** tsvSummarize does the primary work of the tsv-summarize program.
*/
void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles)
{
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    alias StdoutWriter = typeof(stdout.lockingTextWriter());

    /* The number of key fields decides which Summarizer implementation is used. */
    auto summarizer =
        (cmdopt.keyFields.length == 0)
        ? new NoKeySummarizer!StdoutWriter(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (cmdopt.keyFields.length == 1)
        ? new OneKeySummarizer!StdoutWriter(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!StdoutWriter(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Hand the operators from the command line to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Read every input file line by line. With no files given, read standard input. */
    auto lineFields = new char[][](cmdopt.endFieldIndex);
    bool headerFound = false;
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        immutable bool readingStdin = (filename == "-");
        auto sourceName = readingStdin ? "Standard Input" : filename;
        auto inputStream = readingStdin ? stdin : filename.File();

        foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            /* Copy the leading fields the operators and keys need into lineFields.
             * Nothing is copied when no operator needs a field, e.g. the count
             * operator used on its own, which then counts input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t numFieldsCopied = 0;
                foreach (fieldValue;
                         line.splitter(cmdopt.inputFieldDelimiter).take(cmdopt.endFieldIndex))
                {
                    lineFields[numFieldsCopied++] = fieldValue;
                }

                if (numFieldsCopied == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[numFieldsCopied++] = line;
                }

                if (numFieldsCopied < cmdopt.endFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               sourceName, lineNum));
                }
            }

            immutable bool isHeaderLine = cmdopt.hasHeader && lineNum == 1;

            if (!isHeaderLine)
            {
                /* Data line. The summarizer throws if a field cannot be converted
                 * to the type an operator expects.
                 */
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    throw new Exception(
                        format("Could not process line or field: %s\n File: %s Line: %s%s",
                               exc.msg, sourceName, lineNum,
                               (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
                }
            }
            else if (!headerFound)
            {
                /* Only the header of the first file is passed on. */
                summarizer.processHeaderLine(lineFields);
                headerFound = true;
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print the results. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    immutable bool wantHeader = cmdopt.hasHeader || cmdopt.writeHeader;
    if (wantHeader) summarizer.writeSummaryHeader(stdoutWriter, printOptions);

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}

/** The default field header. This is used when the input doesn't have field headers,
 * but field headers are used in the output.
The default is "fieldN", where N is the
 * 1-upped field number.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t oneBasedFieldNumber = fieldIndex + 1;
    return "field" ~ to!string(oneBasedFieldNumber);
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}

/** Build the header for a summary column from the header of the field it summarizes.
 *
 * The result is `<fieldHeader>_<operation>`, e.g. field header "length" with operation
 * "max" gives "length_max". The field header normally comes from a header line in the
 * input data or from fieldHeaderFromIndex().
 *
 * An empty operationName returns fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0) return fieldHeader;
    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}

/** SummarizerPrintOptions carries the printing options used by Summarizers and
 * Calculators. The values typically come from command line options; they are kept in
 * their own struct for modularity.
 */
struct SummarizerPrintOptions
{
    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    import std.traits : isFloatingPoint, isIntegral;

    /** Formats a floating point or integral number using floatPrecision. */
    auto formatNumber(T)(T n) const
    if (isFloatingPoint!T || isIntegral!T)
    {
        /* Renamed on import so it is not confused with this member function. */
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;
        return numericsFormatNumber!T(n, floatPrecision);
    }
}

/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
*
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after initializing the object for each operator to be processed.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 *
 */
interface Summarizer(OutputRange)
{
    /** Called after initializing the object for each operator to be processed. */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/** SummarizerBase holds what every summarizer has in common, which is nearly everything
 * apart from the handling of unique keys.
 *
 * It stores the Operators and, when any operator asks for saved field values, the
 * SharedFieldValues. Derived classes deal with unique keys and the Calculators and
 * UniqueKeyValuesLists that belong to them.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _inputFieldDelimiter = inputFieldDelimiter;
        _missingPolicy = missingPolicy;
    }

    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction. */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            _numOperators++;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Operators that save no field values need no shared storage. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (idx; numericIndices) _sharedFieldValues.addNumericIndex(idx);
            foreach (idx; textIndices) _sharedFieldValues.addTextIndex(idx);
        }
    }

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /* Returns a new UniqueKeyValuesLists, or null when no operator saves field values. */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed. */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operation, so create them as Operators are added. */
        foreach (op; operators) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        foreach (calc; _calculators) calc.processNextLine(lineFields);
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto headerLine = _operators[].map!(o => o.header).join(printOptions.fieldDelimiter);
        put(outputStream, headerLine);
        put(outputStream, '\n');
    }

    /** Called to write the result lines. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto resultLine = _calculators[]
            .map!(calc => calc.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter);
        put(outputStream, resultLine);
        put(outputStream, '\n');
    }
}

/** KeySummarizerBase holds the logic common to the single key and multi-key summarizers.
 *
 * The two differ mainly in how the key is formed. They are separate classes chiefly to
 * keep the handling of single field keys, the most common case, simple and fast.
 */
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;
    private UniqueKeyData[string] _uniqueKeyData;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /* Feeds one line to the calculators and value lists of the given key, creating the
     * entry for the key first if it has not been seen before.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        UniqueKeyData keyData;
        if (auto existing = key in _uniqueKeyData) keyData = *existing;
        else keyData = addUniqueKey(key.to!string);

        foreach (calc; keyData.calculators) calc.processNextLine(lineFields);
        if (keyData.valuesLists !is null) keyData.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /* Registers a new key: records it in encounter order and creates one Calculator per
     * operator for it. Returns the data stored for the key.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto keyCalculators = new Calculator[_numOperators];
        size_t slot = 0;
        foreach (op; _operators) keyCalculators[slot++] = op.makeCalculator;

        auto keyData = UniqueKeyData(keyCalculators, super.makeUniqueKeyValuesLists());
        _uniqueKeyData[key] = keyData;
        return keyData;
    }

    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto operatorHeaders = _operators[].map!(o => o.header).join(printOptions.fieldDelimiter);

        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, operatorHeaders);
        put(outputStream, '\n');
    }

    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        /* Keys are written in the order they were first seen. */
        foreach (key; _uniqueKeys)
        {
            auto keyData = _uniqueKeyData[key];
            auto resultFields = keyData.calculators[]
                .map!(calc => calc.calculate(keyData.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter);

            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream, resultFields);
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}

/** This Summarizer is for the case where the unique key is based on exactly one field.
*/
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    /** Params:
     *    keyFieldIndex = Zero-based index of the field used as the key.
     *    inputFieldDelimiter = Delimiter passed on to the base class.
     *    missingPolicy = Missing field policy passed on to the base class.
     *
     * The key header starts out as the default "fieldN" header and is replaced if a
     * header line is processed.
     */
    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. For the first header line the key field's header is
     * taken from the line. Returns true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strict bound: the index is used to subscript lineFields below. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a non-header line, using the key field's value as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}

/** This Summarizer is for the case where the unique key is based on multiple fields.
*/
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;   // Zero-based indices of the key fields, in key order.
    private string _keyFieldHeader;      // Header written for the key columns.

    /* Note: no key list is kept here. The base class methods maintain their own
     * _uniqueKeys list; a second list declared in this class was never used.
     */

    /* Params:
     *   keyFieldIndices     = zero-based indices of the fields forming the key (copied).
     *   inputFieldDelimiter = forwarded to the base class; also joins the default headers.
     *   missingPolicy       = forwarded to the base class constructor.
     */
    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    /* Header text for the key columns. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /* Processes a header line. On the first header line the key header is rebuilt
     * from the key fields' header text. Returns the base class result.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = joinKeyFields(lineFields);
        }
        return isFirstHeaderLine;
    }

    /* Processes a data line. The unique key is the key fields joined by the input
     * field delimiter.
     */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        processNextLineWithKey(joinKeyFields(lineFields), lineFields);
    }

    /* Joins the key fields of a line, in key order, with the input field delimiter.
     * Shared by header and data line processing so both build keys the same way.
     */
    private string joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
    }
}

version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and expected output. The
     * input file and expected output are already split into lines and fields, the helper
     * manages re-assembly. The program name from the command line args is printed if
     * an error occurs, it is useful to identify the test that failed.
*
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     *
     * Params:
     *   cmdArgs  = command line args; cmdArgs[0] names the test in failure messages.
     *   file     = input lines, each already split into fields.
     *   expected = expected output lines, each already split into fields.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds a failure message prefixed with the test name. 'msg' is a format
         * string; text that is not a format string must be passed as an argument.
         */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            /* Line numbers start at 1, so at most one line is treated as the header. */
            if (cmdopt.hasHeader && lineNum == 1)
            {
                summarizer.processHeaderLine(lineFields);
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    /* The exception text is data, not a format string. Passing it as
                     * an argument keeps any '%' in it from being read as a specifier.
                     */
                    assert(false, formatAssertMessage("%s", exc.msg));
                }
            }
        }
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output; a non-empty result ends with a newline. */
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}

unittest
{
    /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
     * extent, command line option handling (TsvSummarizeOptions). Individual operators
     * have separate tests, those tests test the no-key summarizer. The Values operator is
     * used in these tests. It engages a number of behaviors, and the results have limited
     * ambiguity. Using only one operator limits dependence on individual operators.
     */

    auto file1 = [["fld1", "fld2", "fld3"],
                  ["a", "a", "3"],
                  ["c", "a", "2b"],
                  ["c", "bc", ""],
                  ["a", "c", "2b"],
                  ["", "bc", ""],
                  ["c", "bc", "3"]];

    /* Single-key summarizer tests.
*/
    // Each call passes: command line args (first element names the test), the input
    // rows, and the expected output rows. Output rows appear in first-seen key order.
    testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"],
        file1,
        [["fld1", "fld1_values"],
         ["a", "a|a"],
         ["c", "c|c|c"],
         ["", ""]]
        );
    testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"],
        file1,
        [["fld1", "fld2_values"],
         ["a", "a|c"],
         ["c", "a|bc|bc"],
         ["", "bc"]]
        );
    testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"],
        file1,
        [["fld1", "fld3_values"],
         ["a", "3|2b"],
         ["c", "2b||3"],
         ["", ""]]
        );
    testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"],
        file1,
        [["fld1", "fld1_values", "fld2_values", "fld3_values"],
         ["a", "a|a", "a|c", "3|2b"],
         ["c", "c|c|c", "a|bc|bc", "2b||3"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"],
        file1,
        [["fld1", "fld1_values", "fld2_values", "fld3_values"],
         ["a", "a|a", "a|c", "3|2b"],
         ["c", "c|c|c", "a|bc|bc", "2b||3"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"],
        file1,
        [["fld1", "fld3_values", "fld2_values", "fld1_values"],
         ["a", "3|2b", "a|c", "a|a"],
         ["c", "2b||3", "a|bc|bc", "c|c|c"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"],
        file1,
        [["fld1", "fld3_values", "fld2_values", "fld1_values"],
         ["a", "3|2b", "a|c", "a|a"],
         ["c", "2b||3", "a|bc|bc", "c|c|c"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"],
        file1,
        [["fld2", "fld1_values"],
         ["a", "a|c"],
         ["bc", "c||c"],
         ["c", "a"]]
        );
    testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"],
        file1,
        [["fld2", "fld2_values"],
         ["a", "a|a"],
         ["bc", "bc|bc|bc"],
         ["c", "c"]]
        );
    testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"],
        file1,
        [["fld2", "fld3_values"],
         ["a", "3|2b"],
         ["bc", "||3"],
         ["c", "2b"]]
        );
    testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"],
        file1,
        [["fld2", "fld1_values", "fld3_values"],
         ["a", "a|c", "3|2b"],
         ["bc", "c||c", "||3"],
         ["c", "a", "2b"]]
        );
    testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"],
        file1,
        [["fld2", "fld3_values", "fld1_values"],
         ["a", "3|2b", "a|c"],
         ["bc", "||3", "c||c"],
         ["c", "2b", "a"]]
        );
    testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"],
        file1,
        [["fld3", "fld1_values"],
         ["3", "a|c"],
         ["2b", "c|a"],
         ["", "c|"]]
        );
    testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"],
        file1,
        [["fld3", "fld2_values"],
         ["3", "a|bc"],
         ["2b", "a|c"],
         ["", "bc|bc"]]
        );
    testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"],
        file1,
        [["fld3", "fld1_values", "fld2_values"],
         ["3", "a|c", "a|bc"],
         ["2b", "c|a", "a|c"],
         ["", "c|", "bc|bc"]]
        );

    /* Multi-key summarizer tests. The key columns are written in the order the key
     * fields are listed on the command line.
     */
    testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"],
        file1,
        [["fld1", "fld2", "fld1_values"],
         ["a", "a", "a"],
         ["c", "a", "c"],
         ["c", "bc", "c|c"],
         ["a", "c", "a"],
         ["", "bc", ""]]
        );
    testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"],
        file1,
        [["fld1", "fld2", "fld2_values"],
         ["a", "a", "a"],
         ["c", "a", "a"],
         ["c", "bc", "bc|bc"],
         ["a", "c", "c"],
         ["", "bc", "bc"]]
        );
    testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"],
        file1,
        [["fld1", "fld2", "fld3_values"],
         ["a", "a", "3"],
         ["c", "a", "2b"],
         ["c", "bc", "|3"],
         ["a", "c", "2b"],
         ["", "bc", ""]]
        );
    testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"],
        file1,
        [["fld1", "fld2", "fld3_values", "fld1_values"],
         ["a", "a", "3", "a"],
         ["c", "a", "2b", "c"],
         ["c", "bc", "|3", "c|c"],
         ["a", "c", "2b", "a"],
         ["", "bc", "", ""]]
        );
    testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"],
        file1,
        [["fld3", "fld2", "fld1_values"],
         ["3", "a", "a"],
         ["2b", "a", "c"],
         ["", "bc", "c|"],
         ["2b", "c", "a"],
         ["3", "bc", "c"]]
        );
    testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"],
        file1,
        [["fld3", "fld2", "fld1_values"],
         ["3", "a", "a"],
         ["2b", "a", "c"],
         ["", "bc", "c|"],
         ["2b", "c", "a"],
         ["3", "bc", "c"]]
        );
    testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"],
        file1,
        [["fld2", "fld1", "fld3", "fld2_values"],
         ["a", "a", "3", "a"],
         ["a", "c", "2b", "a"],
         ["bc", "c", "", "bc"],
         ["c", "a", "2b", "c"],
         ["bc", "", "", "bc"],
         ["bc", "c", "3", "bc"]]
        );

    /* Missing policies. In the expected results below, '-x'/'--exclude-missing' drops
     * empty field values from the values lists, and '-r'/'--replace-missing' substitutes
     * the given text for them. Key columns are written unchanged in both cases.
     */
    testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"],
        file1,
        [["fld1", "fld1_values"],
         ["a", "a|a"],
         ["c", "c|c|c"],
         ["", ""]]
        );
    testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"],
        file1,
        [["fld1", "fld2_values"],
         ["a", "a|c"],
         ["c", "a|bc|bc"],
         ["", "bc"]]
        );
    testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"],
        file1,
        [["fld1", "fld3_values"],
         ["a", "3|2b"],
         ["c", "2b|3"],
         ["", ""]]
        );
    testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"],
        file1,
        [["fld1", "fld1_values", "fld2_values", "fld3_values"],
         ["a", "a|a", "a|c", "3|2b"],
         ["c", "c|c|c", "a|bc|bc", "2b|3"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"],
        file1,
        [["fld1", "fld1_values"],
         ["a", "a|a"],
         ["c", "c|c|c"],
         ["", "NA"]]
        );
    testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"],
        file1,
        [["fld1", "fld2_values"],
         ["a", "a|c"],
         ["c", "a|bc|bc"],
         ["", "bc"]]
        );
    testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"],
        file1,
        [["fld1", "fld3_values"],
         ["a", "3|2b"],
         ["c", "2b|NA|3"],
         ["", "NA"]]
        );
    testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"],
        file1,
        [["fld1", "fld1_values", "fld2_values", "fld3_values"],
         ["a", "a|a", "a|c", "3|2b"],
         ["c", "c|c|c", "a|bc|bc", "2b|NA|3"],
         ["", "NA", "bc", "NA"]]
        );
    testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"],
        file1,
        [["fld1", "fld2", "fld3_values", "fld1_values"],
         ["a", "a", "3", "a"],
         ["c", "a", "2b", "c"],
         ["c", "bc", "3", "c|c"],
         ["a", "c", "2b", "a"],
         ["", "bc", "", ""]]
        );
    testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"],
        file1,
        [["fld3", "fld2", "fld1_values"],
         ["3", "a", "a"],
         ["2b", "a", "c"],
         ["", "bc", "c"],
         ["2b", "c", "a"],
         ["3", "bc", "c"]]
        );
    testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"],
        file1,
        [["fld2", "fld1", "fld3", "fld2_values"],
         ["a", "a", "3", "a"],
         ["a", "c", "2b", "a"],
         ["bc", "c", "", "bc"],
         ["c", "a", "2b", "c"],
         ["bc", "", "", "bc"],
         ["bc", "c", "3", "bc"]]
        );
    testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"],
        file1,
        [["fld1", "fld2", "fld3_values", "fld1_values"],
         ["a", "a", "3", "a"],
         ["c", "a", "2b", "c"],
         ["c", "bc", "NA|3", "c|c"],
         ["a", "c", "2b", "a"],
         ["", "bc", "NA", "NA"]]
        );
    testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"],
        file1,
        [["fld3", "fld2", "fld1_values"],
         ["3", "a", "a"],
         ["2b", "a", "c"],
         ["", "bc", "c|NA"],
         ["2b", "c", "a"],
         ["3", "bc", "c"]]
        );
    testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"],
        file1,
        [["fld2", "fld1", "fld3", "fld2_values"],
         ["a", "a", "3", "a"],
         ["a", "c", "2b", "a"],
         ["bc", "c", "", "bc"],
         ["c", "a", "2b", "c"],
         ["bc", "", "", "bc"],
         ["bc", "c", "3", "bc"]]
        );

    /* Validate that the no-key summarizer works with testSummarizer helper function.
     */
    testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"],
        file1,
        [["fld1_values", "fld2_values"],
         ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );

    /* Header variations: no header line; auto-generated header line; custom headers.
     * Tests without '-H' pass file1[1..$] so the input has no header row.
     */
    testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"],
        file1[1..$],
        [["a", "a|a"],
         ["c", "c|c|c"],
         ["", ""]]
        );
    testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"],
        file1[1..$],
        [["a", "a", "a"],
         ["c", "a", "a"],
         ["c", "bc", "bc|bc"],
         ["a", "c", "c"],
         ["", "bc", "bc"]]
        );
    testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"],
        file1[1..$],
        [["field2", "field1_values"],
         ["a", "a|c"],
         ["bc", "c||c"],
         ["c", "a"]]
        );
    testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"],
        file1[1..$],
        [["field3", "field2", "field1_values"],
         ["3", "a", "a"],
         ["2b", "a", "c"],
         ["", "bc", "c|"],
         ["2b", "c", "a"],
         ["3", "bc", "c"]]
        );
    testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"],
        file1,
        [["fld2", "Field3Values"],
         ["a", "3|2b"],
         ["bc", "||3"],
         ["c", "2b"]]
        );
    testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"],
        file1,
        [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"],
         ["a", "a", "3", "a"],
         ["c", "a", "2b", "c"],
         ["c", "bc", "|3", "c|c"],
         ["a", "c", "2b", "a"],
         ["", "bc", "", ""]]
        );
    testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"],
        file1[1..$],
        [["field1", "f3_vals", "f2_vals", "f1_vals"],
         ["a", "3|2b", "a|c", "a|a"],
         ["c", "2b||3", "a|bc|bc", "c|c|c"],
         ["", "", "bc", ""]]
        );
    testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
        file1[1..$],
        [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
         ["a", "3", "a", "3", "a", "a"],
         ["c", "2b", "a", "2b", "c", "a"],
         ["c", "", "bc", "", "c", "bc"],
         ["a", "2b", "c", "2b", "a", "c"],
         ["", "", "bc", "", "", "bc"],
         ["c", "3", "bc", "3", "c", "bc"]]
        );
    testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"],
        file1[1..$],
        [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"],
         ["a", "3", "a", "3", "a", "a"],
         ["c", "2b", "a", "2b", "c", "a"],
         ["c", "", "bc", "", "c", "bc"],
         ["a", "2b", "c", "2b", "a", "c"],
         ["", "", "bc", "", "", "bc"],
         ["c", "3", "bc", "3", "c", "bc"]]
        );

    /* Alternate file widths and lengths. The file names below read as
     * <number of fields>x<number of data rows>.
     */

    auto file3x2 = [["fld1", "fld2", "fld3"],
                    ["a", "b", "c"],
                    ["c", "b", "a"]];

    testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"],
        file3x2,
        [["fld1", "fld3_values"],
         ["a", "c"],
         ["c", "a"]]
        );
    testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"],
        file3x2,
        [["fld2", "fld3_values"],
         ["b", "c|a"]]
        );
    testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"],
        file3x2,
        [["fld2", "fld1", "fld3_values"],
         ["b", "a", "c"],
         ["b", "c", "a"]]
        );

    auto file3x1 = [["fld1", "fld2", "fld3"],
                    ["a", "b", "c"]];

    testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"],
        file3x1,
        [["fld1", "fld3_values"],
         ["a", "c"]]
        );
    testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"],
        file3x1[1..$],
        [["a", "c"]]
        );
    testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"],
        file3x1,
        [["fld2", "fld1", "fld3_values"],
         ["b", "a", "c"]]
        );
    testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"],
        file3x1[1..$],
        [["b", "a", "c"]]
        );

    auto file3x0 = [["fld1", "fld2", "fld3"]];

    testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"],
        file3x0,
        [["fld1", "fld3_values"]]
        );
    testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"],
        file3x0[1..$],
        []
        );
    testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"],
        file3x0[1..$],
        [["field1", "field3_values"]]
        );


    testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"],
        file3x0,
        [["fld2", "fld1", "fld3_values"]]
        );

    testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"],
        file3x0[1..$],
        []
        );

    testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"],
        file3x0[1..$],
        [["field2", "field1", "field3_values"]]
        );

    auto file2x1 = [["fld1", "fld2"],
                    ["a", "b"]];

    testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"],
        file2x1,
        [["fld1", "fld2_values"],
         ["a", "b"]]
        );
    testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"],
        file2x1,
        [["fld2", "fld1", "fld1_values"],
         ["b", "a", "a"]]
        );

    auto file2x0 = [["fld1", "fld2"]];

    testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"],
        file2x0,
        [["fld1", "fld2_values"]]
        );
    testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"],
        file2x0,
        [["fld2", "fld1", "fld1_values"]]
        );

    auto file1x2 = [["fld1"],
                    ["a"],
                    [""]];

    testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"],
        file1x2,
        [["fld1", "fld1_values"],
         ["a", "a"],
         ["", ""]]
        );

    auto file1x2b = [["fld1"],
                     [""],
                     [""]];

    testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"],
        file1x2b,
        [["fld1", "fld1_values"],
         ["", "|"]]
        );

    auto file1x1 = [["fld1"],
                    ["x"]];

    testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"],
        file1x1,
        [["fld1", "fld1_values"],
         ["x", "x"]]
        );

    testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"],
        file1x1[1..$],
        [["x", "x"]]
        );

    testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"],
        file1x1[1..$],
        [["field1", "field1_values"],
         ["x", "x"]]
        );

    auto file1x1b = [["fld1"],
                     [""]];

    testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"],
        file1x1b,
        [["fld1", "fld1_values"],
         ["", ""]]
        );

    auto file1x0 = [["fld1"]];

    testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"],
        file1x0,
        [["fld1", "fld1_values"]]
        );

    testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"],
        file1x0[1..$],
        []
        );

    testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"],
        file1x0[1..$],
        [["field1", "field1_values"]]
        );

    /* Alternate delimiters. '--delimiter' sets the field delimiter; '--values-delimiter'
     * sets the separator used inside a values list.
     */
    testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"],
        file1,
        [["fld1_values", "fld2_values"],
         ["a|c|c|a||c", "a|a|bc|c|bc|bc"]]
        );
    testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"],
        file1,
        [["fld1_values", "fld2_values"],
         ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]]
        );
    testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","],
        file1,
        [["fld1_values", "fld2_values"],
         ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]]
        );
    testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1",
                    "--delimiter", "^", "--values-delimiter", ":"],
        file1[1..$],
        [["field2", "field1_values"],
         ["a", "a:c"],
         ["bc", "c::c"],
         ["c", "a"]]
        );
    testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/",
                    "--values-delimiter", "\\"],
        file1[1..$],
        [["a", "a", "a"],
         ["c", "a", "a"],
         ["c", "bc", "bc\\bc"],
         ["a", "c", "c"],
         ["", "bc", "bc"]]
        );
}

/* Summary Operators and Calculators
 *
 * Two types of objects are used in implementation: Operators and Calculators. An Operator
 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A
 * Calculator is used to manage the summary calculation for each unique key in the input.
 *
 * As an example, consider the command:
 *
 *    $ tsv-summarize --group-by 1 --mean 3 --mean 5
 *
 * This command will create two instances of a MeanOperator, one each for fields 3 and 5.
 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also
 * create MeanCalculator objects for each unique value in field 1. For 'mean', a
 * calculator needs to track occurrence count and sum. Calculators produce the final
 * value when all processing is finished.
*
 * Summary field headers
 *
 * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden
 * on the command line. These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
 */

/** An Operator represents a summary calculation specified on the command line.
 *  e.g. '--mean 5'.
 */
interface Operator
{
    /// Header text for this operator's output column.
    @property string header();

    /// Name of the operator.
    @property string name();

    /// Passes the fields of the input header line to the operator.
    void processHeaderLine(const char[][] fields);

    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved

    /// Creates a Calculator for this operator; one is made per unique key.
    Calculator makeCalculator();
}

/** Calculators are responsible for the calculation of a single computation. They
 *  process each line and produce the final value when all processing is finished.
 */
interface Calculator
{
    /// Passes the fields of the next data line to the calculator.
    void processNextLine(const char[][] fields);

    /// Produces the final value as text, with access to the key's saved values lists.
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/** This class describes processing behavior when a missing value is encountered.
 *
 *  Exactly one of useMissing, excludeMissing and replaceMissing is true at a time.
 *  A non-empty replacement string selects replacement, even if exclusion was also
 *  requested.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // True if missing values are processed unchanged.
    private bool _replaceMissing = false;     // True if missing values are replaced.
    private string _missingReplacement;       // Replacement string if replaceMissing is true.

    this (in bool excludeMissing = false, in string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets the policy from the two settings. Replacement is enabled by a non-empty
     * replacement string; missing values are used unchanged only if neither exclusion
     * nor replacement applies.
     */
    void updatePolicy(in bool excludeMissing, in string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* A field is missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return field.length == 0;
    }

    final bool useMissing() const @property
    {
        return _useMissing;
    }

    /* True when missing values are neither used unchanged nor replaced. */
    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list
 *   of the fields advertised by Operators as needing sharing.
This list gets created 1727 * during command initialization (SummarizerBase.setOperators). 1728 * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every 1729 * time a new unique key is found, in parellel to the Calculator objects created for the 1730 * key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes. 1731 * - A unique key's UniqueKeyValuesLists object is passed each input line, same as 1732 * Calculators, saving the values. 1733 * - Calculators retrieve the saved values during the calculation phase. The calculator's 1734 * ProcessNextField method is typically a no-op. 1735 * - Calculators cannot make assumptions about the order of the saved values. This is 1736 * pragmatic concession to median and quantile calculations, which need to sort the data, 1737 * at least partially. Rather than generate sorted copies, the current algorithms 1738 * sort the data in place. 1739 * 1740 * One concession to duplicate storage is that text and numeric versions of the same 1741 * field might be stored. The reason is because it's important to convert text to numbers 1742 * as they are read so that useful error messages can be generated. And, storing both 1743 * forms of the same field should be less common. 1744 * 1745 * The current implementation uses the same missing values policy for all fields. If 1746 * multiple policies become supported this will need to change. 1747 * 1748 * Built-in calculations - UniqueKeyValueLists have a built-in median operation. This is 1749 * to avoid repeated calculations of the median by different calculations. 1750 */ 1751 1752 final class SharedFieldValues 1753 { 1754 // Arrays with field indices that need to be saved. 1755 private size_t[] _numericFieldIndices; 1756 private size_t[] _textFieldIndices; 1757 1758 /* Called during summarizer setup to add a shared field value for a specific field index. 1759 * eg. 
'--median 7' will add end up calling addNumericIdex(6), 6 being the zero-based index. 1760 * A specific index is only added once. 1761 */ 1762 final void addNumericIndex (size_t index) 1763 { 1764 if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index; 1765 } 1766 1767 /* Similar to addNumericIndex, except adds a text index. */ 1768 final void addTextIndex (size_t index) 1769 { 1770 if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index; 1771 } 1772 1773 /* Called every time a new key is found, or once at the beginning of the program if no keys 1774 * are being used (entire column summarized). 1775 */ 1776 final UniqueKeyValuesLists makeUniqueKeyValuesLists() 1777 { 1778 return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices); 1779 } 1780 } 1781 1782 final class UniqueKeyValuesLists 1783 { 1784 /* A FieldValues object holds is a list of values collect for a specific field. A 1785 * unique key may hold several. For example, the command: 1786 * $ tsv-summarize --k 1 --median 4 -- median 5 1787 * requires keeping lists for both fields 4 and 5. This in turn will result in a 1788 * _numericFieldValues being a 2 element array, one with a list of field 4 values, 1789 * the second of field 5 values. Linear search is used to find a specific field. 1790 */ 1791 private FieldValues!double[] _numericFieldValues; 1792 private FieldValues!string[] _textFieldValues; 1793 private double[] _numericFieldMedians; 1794 1795 /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. 
*/
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        /* A collector array is allocated only when at least one field of that
         * kind was requested; otherwise the member is left untouched.
         */
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (slot, inputIndex; numericFieldIndices)
            {
                _numericFieldValues[slot] = new FieldValues!double(inputIndex);
            }
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (slot, inputIndex; textFieldIndices)
            {
                _textFieldValues[slot] = new FieldValues!string(inputIndex);
            }
        }
    }

    /* Hands the split line to every collector, numeric collectors first. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (collector; _numericFieldValues)
        {
            collector.processNextLine(fields, missingPolicy);
        }
        foreach (collector; _textFieldValues)
        {
            collector.processNextLine(fields, missingPolicy);
        }
    }

    /* Locates the numeric collector registered for the given field index. The index
     * is expected to have been registered at construction (checked by assert only).
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        auto matches =
            find!((FieldValues!double held, size_t wanted) => (held.fieldIndex == wanted))
                (_numericFieldValues, index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Locates the text collector registered for the given field index. The index
     * is expected to have been registered at construction (checked by assert only).
     */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        auto matches =
            find!((FieldValues!string held, size_t wanted) => (held.fieldIndex == wanted))
                (_textFieldValues, index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Saved numeric values for a field, in the collector's current order. */
    final double[] numericValues(size_t index)
    {
        auto collector = findNumericFieldValues(index);
        return collector.getArray;
    }

    /* Saved numeric values for a field, sorted (the stored array is sorted in place). */
    final double[] numericValuesSorted(size_t index)
    {
        auto collector = findNumericFieldValues(index);
        return collector.getSortedArray;
    }

    /* Saved text values for a field, in the collector's current order. */
    final string[] textValues(size_t index)
    {
        auto collector = findTextFieldValues(index);
        return collector.getArray;
    }

    /* Saved text values for a field, sorted (the stored array is sorted in place). */
    final string[] textValuesSorted(size_t index)
    {
        auto collector = findTextFieldValues(index);
        return collector.getSortedArray;
    }

    /* Median of the saved numeric values for a field. */
    final double numericValuesMedian(size_t index)
    {
        auto collector = findNumericFieldValues(index);
        return collector.median;
    }

    /* FieldValues accumulates the values seen for one field index and caches the
     * derived results (sorted state, median) until another value is appended.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : appender;

        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        /* Number of values accumulated so far. */
        final size_t length() const @property
        {
            return _values.data.length;
        }

        /* Index of the input field this collector reads. */
        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Appends this collector's field from the line, applying the missing-field
         * policy: a missing field is either used as-is, replaced, or skipped.
         * Appending invalidates the cached median and the sorted flag.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            const bool takeAsIs =
                missingPolicy.useMissing || !missingPolicy.isMissingField(field);

            /* Missing and no replacement configured: the line contributes nothing. */
            if (!takeAsIs && !missingPolicy.replaceMissing) return;

            if (takeAsIs)
            {
                _values.put(field.to!ValueType);
            }
            else
            {
                _values.put(missingPolicy.missingReplacement.to!ValueType);
            }

            _haveMedian = false;
            _isSorted = false;
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        /* The accumulated values as an array, in their current order. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* The accumulated values sorted in place. Sorting is skipped when nothing
         * has been appended since the previous sort.
         */
        final ValueType[] getSortedArray()
        {
            if (_isSorted) return _values.data;

            import std.algorithm : sort;
            sort(_values.data);
            _isSorted = true;
            return _values.data;
        }

        /* Median of the accumulated values, cached until the next append.
         *
         * NOTE(review): rangeMedian is handed the live backing array. Whether it
         * reorders that array is not visible here - confirm _isSorted stays valid.
         */
        final ValueType median()
        {
            if (_haveMedian) return _medianValue;

            import tsv_utils.common.numerics : rangeMedian;
            _medianValue = _values.data.rangeMedian();
            _haveMedian = true;
            return _medianValue;
        }
    }
}

/** SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
1941 */ 1942 class SingleFieldOperator : Operator 1943 { 1944 import std.typecons : Flag; 1945 1946 private string _name; 1947 private string _header; 1948 private size_t _fieldIndex; 1949 private bool _useHeaderSuffix; 1950 private bool _allowCustomHeader; 1951 private bool _hasCustomHeader = false; 1952 private size_t[] _numericFieldsToSave; 1953 private size_t[] _textFieldsToSave; 1954 private MissingFieldPolicy _missingPolicy; 1955 1956 this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, 1957 Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, 1958 Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) 1959 { 1960 _name = operatorName; 1961 _fieldIndex = fieldIndex; 1962 _missingPolicy = missingPolicy; 1963 _useHeaderSuffix = useHeaderSuffix; 1964 _allowCustomHeader = allowCustomHeader; 1965 // Default header. May be overrridden by custom header or header line. 1966 _header = 1967 fieldHeaderFromIndex(fieldIndex) 1968 .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); 1969 } 1970 1971 void setCustomHeader (string customHeader) 1972 { 1973 assert(_allowCustomHeader); 1974 _header = customHeader; 1975 _hasCustomHeader = true; 1976 } 1977 1978 final string name() const @property 1979 { 1980 return _name; 1981 } 1982 1983 final bool allowCustomHeader() const @property 1984 { 1985 return _allowCustomHeader; 1986 } 1987 1988 /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field 1989 * that the field values should be saved. These should called during construction. 
1990 */ 1991 final void setSaveFieldValuesNumeric() 1992 { 1993 _numericFieldsToSave ~= _fieldIndex; 1994 } 1995 1996 final void setSaveFieldValuesText() 1997 { 1998 _textFieldsToSave ~= _fieldIndex; 1999 } 2000 2001 final MissingFieldPolicy missingPolicy() @property 2002 { 2003 return _missingPolicy; 2004 } 2005 2006 final size_t fieldIndex() const @property 2007 { 2008 return _fieldIndex; 2009 } 2010 2011 final string header() const @property 2012 { 2013 return _header; 2014 } 2015 2016 final bool useHeaderSuffix() const @property 2017 { 2018 return _useHeaderSuffix; 2019 } 2020 2021 void processHeaderLine(const char[][] fields) 2022 { 2023 if (!_hasCustomHeader) { 2024 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2025 _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 2026 _useHeaderSuffix ? _name : ""); 2027 } 2028 } 2029 2030 final size_t[] numericFieldsToSave() 2031 { 2032 return _numericFieldsToSave; 2033 } 2034 2035 final size_t[] textFieldsToSave() 2036 { 2037 return _textFieldsToSave; 2038 } 2039 2040 abstract SingleFieldCalculator makeCalculator(); 2041 } 2042 2043 /** SingleFieldCalculator is a base class for the common case of calculators using a single 2044 * field. Derived classes implement processNextField() rather than processNextLine(). 
2045 */ 2046 class SingleFieldCalculator : Calculator 2047 { 2048 private size_t _fieldIndex; 2049 2050 this(size_t fieldIndex) 2051 { 2052 _fieldIndex = fieldIndex; 2053 } 2054 2055 final size_t fieldIndex() const @property 2056 { 2057 return _fieldIndex; 2058 } 2059 2060 final void processNextLine(const char[][] fields) 2061 { 2062 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2063 2064 auto missingPolicy = getOperator.missingPolicy; 2065 const char[] field = fields[_fieldIndex]; 2066 2067 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 2068 { 2069 processNextField(field); 2070 } 2071 else if (missingPolicy.replaceMissing) 2072 { 2073 processNextField(missingPolicy.missingReplacement); 2074 } 2075 } 2076 2077 abstract SingleFieldOperator getOperator(); 2078 2079 abstract void processNextField(const char[] field); 2080 } 2081 2082 /* Unittest helper functions. Only compiled when -unittest is in effect. */ 2083 version(unittest) 2084 { 2085 /** A helper for SingleFieldOperator unit tests. 2086 * 2087 * testSingleFieldOperator takes a set of split file values, a field index, a header 2088 * suffix, and a set of expected values. The expected values array contains the 2089 * initial value (zero entries) and the expected values after each line. (One more 2090 * expected value than input lines.) The zero entry case is what is generated for an 2091 * empty file. An example testing the 'min' operator against a file with 2 columns, 2092 * 3 rows, using field index 1: 2093 * 2094 * testSingleFieldOperator!MinOperator( 2095 * [["10", "100"], // The split file. 3 lines by 2 rows. 2096 * ["5", "50"], 2097 * ["20", "200"]], 2098 * 1, // Field index (zero-based, so "100", "50", "200") 2099 * "min", // The header suffix, normally the operator name. 2100 * ["nan", "100", "50", "50"]); // Min value after processing each line. 2101 * 2102 * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. 
2103 * Then run the operator is tested against each column, a total of six calls. Headers 2104 * are automatically checked. Additional entries can be used to extend coverage. 2105 * 2106 * A non-default MissingFieldPolicy can be provide as an optional last argument. 2107 * Operator tests should include exclusion and replacement variations. See operator 2108 * unit tests for details. 2109 * 2110 * The testSingleFieldOperatorBase adds an additional capability - Custom operator 2111 * init arguments. Currently this is used only by the quantile operator. 2112 * 2113 * These tests do not check unique key behavior (group-by). Operators don't have info 2114 * about unique keys, and interact with them only indirectly, via Calculators. 2115 */ 2116 void testSingleFieldOperator(OperatorClass : SingleFieldOperator) 2117 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2118 const char[][] expectedValues, 2119 MissingFieldPolicy missingPolicy = new MissingFieldPolicy) 2120 { 2121 testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy); 2122 } 2123 2124 void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...) 2125 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2126 const char[][] expectedValues, 2127 MissingFieldPolicy missingPolicy, 2128 T extraOpInitArgs) 2129 { 2130 import std.format : format; 2131 import std.array : appender; 2132 import std..string : chomp; 2133 import std.traits : EnumMembers; 2134 2135 auto numFields = (splitFile[0]).length; 2136 2137 assert(fieldIndex < numFields, 2138 format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s", 2139 headerSuffix)); 2140 assert(splitFile.length + 1 == expectedValues.length, 2141 format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2142 headerSuffix)); 2143 2144 /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. 
*/ 2145 auto printOptions = SummarizerPrintOptions('#', '|'); 2146 2147 /* An input header line. */ 2148 string[] inputHeaderLine = new string[numFields]; 2149 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2150 2151 /* The different expected output field headers. */ 2152 auto outputFieldHeaderWithNoHeaderLine = 2153 fieldHeaderFromIndex(fieldIndex) 2154 .summaryHeaderFromFieldHeader(headerSuffix); 2155 auto outputFieldHeaderFromHeaderLine = 2156 inputHeaderLine[fieldIndex] 2157 .summaryHeaderFromFieldHeader(headerSuffix); 2158 auto customOutputFieldHeader = "custom"; 2159 2160 enum HeaderUsecase { 2161 HeaderLine_DefaultHeader, 2162 HeaderLine_CustomHeader, 2163 NoHeaderLine_DefaultHeader, 2164 NoHeaderLine_CustomHeader, 2165 NoHeaderLine_NoOutputHeader, 2166 } 2167 2168 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2169 { 2170 return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2171 op.name, hc, actual, expected); 2172 } 2173 2174 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex, 2175 const char[] actual, const char[] expected) 2176 { 2177 return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'", 2178 op.name, hc, rowIndex, fieldIndex, actual, expected); 2179 } 2180 2181 /* Run the logic for each header use case. 
*/ 2182 foreach (hc; EnumMembers!HeaderUsecase) 2183 { 2184 bool hasInputHeader = ( 2185 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2186 hc == HeaderUsecase.HeaderLine_CustomHeader 2187 ); 2188 bool hasOutputHeader = ( 2189 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2190 hc == HeaderUsecase.HeaderLine_CustomHeader || 2191 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2192 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2193 ); 2194 bool hasCustomHeader = ( 2195 hc == HeaderUsecase.HeaderLine_CustomHeader || 2196 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2197 ); 2198 2199 if (hasCustomHeader) assert(hasOutputHeader); 2200 2201 auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs); 2202 2203 if (hasCustomHeader) 2204 { 2205 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2206 op.setCustomHeader(customOutputFieldHeader); 2207 } 2208 2209 Operator[] operatorArray; 2210 operatorArray ~= op; 2211 2212 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2213 summarizer.setOperators(inputRangeObject(operatorArray)); 2214 2215 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2216 2217 if (hasOutputHeader) 2218 { 2219 /* Write the header line. Note that this is a one-field header, */ 2220 auto headerLineOutput = appender!(char[])(); 2221 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2222 2223 /* Test that the header was generated correctly. 2224 * 2225 * Note: Because the output is generated by a Summarizer, it will have a 2226 * trailing newline. Use chomp to trim it. 
2227 */ 2228 final switch (hc) 2229 { 2230 case HeaderUsecase.HeaderLine_DefaultHeader: 2231 assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine, 2232 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2233 outputFieldHeaderFromHeaderLine)); 2234 break; 2235 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2236 assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine, 2237 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2238 outputFieldHeaderWithNoHeaderLine)); 2239 break; 2240 case HeaderUsecase.HeaderLine_CustomHeader: 2241 case HeaderUsecase.NoHeaderLine_CustomHeader: 2242 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2243 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2244 customOutputFieldHeader)); 2245 break; 2246 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2247 break; 2248 } 2249 2250 } 2251 2252 /* For each line, process the line, generate the output, and test that the 2253 * value is correct. Start with the empty file case. 2254 */ 2255 foreach (i, const char[] expected; expectedValues) 2256 { 2257 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2258 auto summaryLineOutput = appender!(char[])(); 2259 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2260 assert(summaryLineOutput.data.chomp == expected, 2261 valueAssertMessage(operatorArray[0], hc, i, fieldIndex, 2262 summaryLineOutput.data.chomp, expectedValues[i])); 2263 } 2264 } 2265 } 2266 } 2267 2268 /** ZeroFieldOperator is a base class for operators that take no input. The main use 2269 * case is the CountOperator, which counts the occurrences of each unique key. Other 2270 * uses are possible, for example, weighted random number assignment. 2271 * 2272 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify 2273 * the information available to such a routine. 
In particular, the split fields passed 2274 * to processHeaderLine and processNextLine don't include all fields in the input, 2275 * something that might not be obvious when implementing an operator. (Only fields 2276 * required by operators acting on specific fields are included.) 2277 */ 2278 class ZeroFieldOperator : Operator 2279 { 2280 import std.typecons : Flag; 2281 2282 private string _name; 2283 private string _header; 2284 2285 this(string operatorName) 2286 { 2287 _name = operatorName; 2288 _header = operatorName; 2289 } 2290 2291 void setCustomHeader (string customHeader) 2292 { 2293 _header = customHeader; 2294 } 2295 2296 bool allowCustomHeader() const @property 2297 { 2298 return true; 2299 } 2300 2301 final string name() const @property 2302 { 2303 return _name; 2304 } 2305 2306 final string header() const @property 2307 { 2308 return _header; 2309 } 2310 2311 /* A no-op. ZeroFieldOperators have no access to the header line. */ 2312 final void processHeaderLine(const char[][] fields) { } 2313 2314 /* A no-op. ZeroFieldOperators have no access to fields. */ 2315 final size_t[] numericFieldsToSave() 2316 { 2317 size_t[] emptyArray; 2318 return emptyArray; 2319 } 2320 2321 /* A no-op. ZeroFieldOperators have no access to fields. */ 2322 final size_t[] textFieldsToSave() 2323 { 2324 size_t[] emptyArray; 2325 return emptyArray; 2326 } 2327 2328 abstract ZeroFieldCalculator makeCalculator(); 2329 } 2330 2331 /** ZeroFieldCalculator is a base class for operators that don't use fields as input. 2332 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class. 2333 * 2334 * Derived classes implement processNextEntry() rather than processNextLine(), and the 2335 * single argument form of calculate() given as an abstract function. 
2336 */ 2337 class ZeroFieldCalculator : Calculator 2338 { 2339 this() { } 2340 2341 final void processNextLine(const char[][] fields) 2342 { 2343 debug writefln("[%s]", __FUNCTION__,); 2344 processNextEntry(); 2345 } 2346 2347 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2348 { 2349 return calculate(printOptions); 2350 } 2351 2352 abstract void processNextEntry(); 2353 abstract string calculate(const ref SummarizerPrintOptions printOptions); 2354 } 2355 2356 version(unittest) 2357 { 2358 /* A helper for ZeroFieldOperator unit tests. 2359 * 2360 * testZeroFieldOperator takes a set of split file values, a default header, and a 2361 * set of expected values. The expected values array contains the expected values 2362 * after each line. 2363 * 2364 * testZeroFieldOperator is very similar to testSingleFieldOperator, except that 2365 * there is no use of field indices and fewer types of headers. See the latter's 2366 * documentation and the CountOperator unit tests for examples. 2367 */ 2368 void testZeroFieldOperator(OperatorClass : ZeroFieldOperator) 2369 (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues) 2370 { 2371 import std.format : format; 2372 import std.array : appender; 2373 import std..string : chomp; 2374 import std.traits : EnumMembers; 2375 2376 auto numFields = (splitFile[0]).length; 2377 2378 assert(splitFile.length + 1 == expectedValues.length, 2379 format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2380 defaultHeader)); 2381 2382 /* printOptions - Not used these tests, but needed for API calls. */ 2383 auto printOptions = SummarizerPrintOptions('#', '|'); 2384 2385 /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */ 2386 auto missingPolicy = new MissingFieldPolicy; 2387 2388 /* An input header line. 
*/ 2389 string[] inputHeaderLine = new string[numFields]; 2390 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2391 2392 auto customOutputFieldHeader = "custom"; 2393 2394 enum HeaderUsecase { 2395 HeaderLine_DefaultHeader, 2396 HeaderLine_CustomHeader, 2397 NoHeaderLine_DefaultHeader, 2398 NoHeaderLine_CustomHeader, 2399 NoHeaderLine_NoOutputHeader, 2400 } 2401 2402 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2403 { 2404 return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2405 op.name, hc, actual, expected); 2406 } 2407 2408 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, 2409 const char[] actual, const char[] expected) 2410 { 2411 return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", 2412 op.name, hc, rowIndex, actual, expected); 2413 } 2414 2415 /* Run the logic for each header use case. 
*/ 2416 foreach (hc; EnumMembers!HeaderUsecase) 2417 { 2418 bool hasInputHeader = ( 2419 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2420 hc == HeaderUsecase.HeaderLine_CustomHeader 2421 ); 2422 bool hasOutputHeader = ( 2423 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2424 hc == HeaderUsecase.HeaderLine_CustomHeader || 2425 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2426 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2427 ); 2428 bool hasCustomHeader = ( 2429 hc == HeaderUsecase.HeaderLine_CustomHeader || 2430 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2431 ); 2432 2433 if (hasCustomHeader) assert(hasOutputHeader); 2434 2435 auto op = new OperatorClass(); 2436 2437 if (hasCustomHeader) 2438 { 2439 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2440 op.setCustomHeader(customOutputFieldHeader); 2441 } 2442 2443 Operator[] operatorArray; 2444 operatorArray ~= op; 2445 2446 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2447 summarizer.setOperators(inputRangeObject(operatorArray)); 2448 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2449 2450 if (hasOutputHeader) 2451 { 2452 /* Write the header line. Note that this is a one-field header, */ 2453 auto headerLineOutput = appender!(char[])(); 2454 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2455 2456 /* Test that the header was generated correctly. 2457 * 2458 * Note: Because the output is generated by a Summarizer, it will have a 2459 * trailing newline. Use chomp to trim it. 
2460 */ 2461 final switch (hc) 2462 { 2463 case HeaderUsecase.HeaderLine_DefaultHeader: 2464 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2465 assert(headerLineOutput.data.chomp == defaultHeader, 2466 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2467 defaultHeader)); 2468 break; 2469 case HeaderUsecase.HeaderLine_CustomHeader: 2470 case HeaderUsecase.NoHeaderLine_CustomHeader: 2471 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2472 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2473 customOutputFieldHeader)); 2474 break; 2475 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2476 break; 2477 } 2478 2479 } 2480 2481 /* For each line, process the line, generate the output, and test that the 2482 * value is correct. Start with the empty file case. 2483 */ 2484 foreach (i, const char[] expected; expectedValues) 2485 { 2486 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2487 auto summaryLineOutput = appender!(char[])(); 2488 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2489 assert(summaryLineOutput.data.chomp == expected, 2490 valueAssertMessage(operatorArray[0], hc, i, 2491 summaryLineOutput.data.chomp, expectedValues[i])); 2492 } 2493 } 2494 } 2495 } 2496 2497 /* Specific operators. 2498 * 2499 * Notes: 2500 * - The 'Calculator' inner classes are 'static'. This means inner class instances do not 2501 * keep a reference to the context of the outer class. In exchange, Calculator instances 2502 * need to hold all needed state, typically the field index they are summarizing. 2503 */ 2504 2505 /** CountOperator counts the number of occurrences of each unique key, or the number of 2506 * input lines if there is no unique key. 2507 * 2508 * CountOperator differs from most other operators in that it doesn't summarize a specific 2509 * field on the line. Instead it is summarizing a property of the unique key itself. For 2510 * this reason it doesn't derive from SingleFieldOperator. 
2511 */ 2512 final class CountOperator : ZeroFieldOperator 2513 { 2514 this() 2515 { 2516 super("count"); 2517 } 2518 2519 final override ZeroFieldCalculator makeCalculator() 2520 { 2521 return new CountCalculator(); 2522 } 2523 2524 static final class CountCalculator : ZeroFieldCalculator 2525 { 2526 private size_t _count = 0; 2527 2528 final override void processNextEntry() 2529 { 2530 _count++; 2531 } 2532 2533 final override string calculate(const ref SummarizerPrintOptions printOptions) 2534 { 2535 return printOptions.formatNumber(_count); 2536 } 2537 } 2538 } 2539 2540 unittest // CountOperator 2541 { 2542 auto col1File = [["10"], ["9.5"], ["11"]]; 2543 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2544 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2545 2546 testZeroFieldOperator!CountOperator(col1File, "count", ["0", "1", "2", "3"]); 2547 testZeroFieldOperator!CountOperator(col2File, "count", ["0", "1", "2", "3"]); 2548 testZeroFieldOperator!CountOperator(col3File, "count", ["0", "1", "2", "3"]); 2549 } 2550 2551 /** RetainOperator retains the first occurrence of a field, without changing the header. 2552 * 2553 * RetainOperator is intended for fields where the value is expected to be the same for 2554 * all occurrences of the unique key, and the goal is to pass the value through unchanged. 2555 * It is like FirstOperator, except that the original header is preserved. The original 2556 * header preservation is setup in the call to the SingleFieldOperation constructor. 2557 * 2558 * Notes: 2559 * - An option to signal an error if multiple values are encountered might be useful. 
2560 */ 2561 final class RetainOperator : SingleFieldOperator 2562 { 2563 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2564 { 2565 super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader); 2566 } 2567 2568 final override SingleFieldCalculator makeCalculator() 2569 { 2570 return new RetainCalculator(fieldIndex); 2571 } 2572 2573 final class RetainCalculator : SingleFieldCalculator 2574 { 2575 private bool _done = false; 2576 private string _value = ""; 2577 2578 this(size_t fieldIndex) 2579 { 2580 super(fieldIndex); 2581 } 2582 2583 final override RetainOperator getOperator() 2584 { 2585 return this.outer; 2586 } 2587 2588 final override void processNextField(const char[] nextField) 2589 { 2590 if (!_done) 2591 { 2592 _value = nextField.to!string; 2593 _done = true; 2594 } 2595 } 2596 2597 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2598 { 2599 return _value; 2600 } 2601 } 2602 } 2603 2604 unittest // RetainOperator 2605 { 2606 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2607 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2608 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2609 2610 testSingleFieldOperator!RetainOperator(col1File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2611 testSingleFieldOperator!RetainOperator(col2File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2612 testSingleFieldOperator!RetainOperator(col2File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2613 testSingleFieldOperator!RetainOperator(col3File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2614 testSingleFieldOperator!RetainOperator(col3File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2615 testSingleFieldOperator!RetainOperator(col3File, 2, "", ["", "r1c3", "r1c3", "r1c3"]); 2616 2617 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2618 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"], 2619 new 
MissingFieldPolicy(true, "")); // Exclude missing 2620 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"], 2621 new MissingFieldPolicy(false, "NA")); // Replace missing 2622 } 2623 2624 /** FirstOperator outputs the first value found for the field. 2625 */ 2626 final class FirstOperator : SingleFieldOperator 2627 { 2628 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2629 { 2630 super("first", fieldIndex, missingPolicy); 2631 } 2632 2633 final override SingleFieldCalculator makeCalculator() 2634 { 2635 return new FirstCalculator(fieldIndex); 2636 } 2637 2638 final class FirstCalculator : SingleFieldCalculator 2639 { 2640 private bool _done = false; 2641 private string _value = ""; 2642 2643 this(size_t fieldIndex) 2644 { 2645 super(fieldIndex); 2646 } 2647 2648 final override FirstOperator getOperator() 2649 { 2650 return this.outer; 2651 } 2652 2653 final override void processNextField(const char[] nextField) 2654 { 2655 if (!_done) 2656 { 2657 _value = nextField.to!string; 2658 _done = true; 2659 } 2660 } 2661 2662 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2663 { 2664 return _value; 2665 } 2666 } 2667 } 2668 2669 unittest // FirstOperator 2670 { 2671 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2672 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2673 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2674 2675 testSingleFieldOperator!FirstOperator(col1File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2676 testSingleFieldOperator!FirstOperator(col2File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2677 testSingleFieldOperator!FirstOperator(col2File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2678 testSingleFieldOperator!FirstOperator(col3File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2679 testSingleFieldOperator!FirstOperator(col3File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2680 
testSingleFieldOperator!FirstOperator(col3File, 2, "first", ["", "r1c3", "r1c3", "r1c3"]); 2681 2682 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2683 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"], 2684 new MissingFieldPolicy(true, "")); // Exclude missing 2685 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"], 2686 new MissingFieldPolicy(false, "NA")); // Replace missing 2687 } 2688 2689 /** LastOperator outputs the last value found for the field. 2690 */ 2691 final class LastOperator : SingleFieldOperator 2692 { 2693 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2694 { 2695 super("last", fieldIndex, missingPolicy); 2696 } 2697 2698 final override SingleFieldCalculator makeCalculator() 2699 { 2700 return new LastCalculator(fieldIndex); 2701 } 2702 2703 final class LastCalculator : SingleFieldCalculator 2704 { 2705 private string _value = ""; 2706 2707 this(size_t fieldIndex) 2708 { 2709 super(fieldIndex); 2710 } 2711 2712 final override LastOperator getOperator() 2713 { 2714 return this.outer; 2715 } 2716 2717 final override void processNextField(const char[] nextField) 2718 { 2719 _value = nextField.to!string; 2720 } 2721 2722 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2723 { 2724 return _value; 2725 } 2726 } 2727 } 2728 2729 unittest // LastOperator 2730 { 2731 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2732 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2733 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2734 2735 testSingleFieldOperator!LastOperator(col1File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2736 testSingleFieldOperator!LastOperator(col2File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2737 testSingleFieldOperator!LastOperator(col2File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2738 
testSingleFieldOperator!LastOperator(col3File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2739 testSingleFieldOperator!LastOperator(col3File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2740 testSingleFieldOperator!LastOperator(col3File, 2, "last", ["", "r1c3", "r2c3", "r3c3"]); 2741 2742 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2743 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"], 2744 new MissingFieldPolicy(true, "")); // Exclude missing 2745 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"], 2746 new MissingFieldPolicy(false, "NA")); // Replace missing 2747 } 2748 2749 /** MinOperator output the minimum value for the field. This is a numeric operator. 2750 */ 2751 final class MinOperator : SingleFieldOperator 2752 { 2753 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2754 { 2755 super("min", fieldIndex, missingPolicy); 2756 } 2757 2758 final override SingleFieldCalculator makeCalculator() 2759 { 2760 return new MinCalculator(fieldIndex); 2761 } 2762 2763 final class MinCalculator : SingleFieldCalculator 2764 { 2765 private bool _isFirst = true; 2766 private double _value = double.nan; 2767 2768 this(size_t fieldIndex) 2769 { 2770 super(fieldIndex); 2771 } 2772 2773 final override MinOperator getOperator() 2774 { 2775 return this.outer; 2776 } 2777 2778 final override void processNextField(const char[] nextField) 2779 { 2780 double fieldValue = nextField.to!double; 2781 if (_isFirst) 2782 { 2783 _value = fieldValue; 2784 _isFirst = false; 2785 } 2786 else if (fieldValue < _value) 2787 { 2788 _value = fieldValue; 2789 } 2790 } 2791 2792 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2793 { 2794 return printOptions.formatNumber(_value); 2795 } 2796 } 2797 } 2798 2799 unittest // MinOperator 2800 { 2801 auto col1File = [["10"], ["9.5"], ["11"]]; 2802 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2803 auto 
col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2804 2805 testSingleFieldOperator!MinOperator(col1File, 0, "min", ["nan", "10", "9.5", "9.5"]); 2806 testSingleFieldOperator!MinOperator(col2File, 0, "min", ["nan", "20", "20", "20"]); 2807 testSingleFieldOperator!MinOperator(col2File, 1, "min", ["nan", "-30", "-30", "-31"]); 2808 testSingleFieldOperator!MinOperator(col3File, 0, "min", ["nan", "9009", "199", "199"]); 2809 testSingleFieldOperator!MinOperator(col3File, 1, "min", ["nan", "9", "0", "0"]); 2810 testSingleFieldOperator!MinOperator(col3File, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]); 2811 2812 auto col1misFile = [[""], ["10"], ["-10"]]; 2813 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"], 2814 new MissingFieldPolicy(true, "")); // Exclude missing 2815 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"], 2816 new MissingFieldPolicy(false, "5")); // Replace missing 2817 } 2818 2819 /** MaxOperator output the maximum value for the field. This is a numeric operator. 
/** MaxOperator outputs the largest value seen for the field. This is a numeric operator.
 *
 * Every field value is converted to a double as it arrives and only the running
 * maximum is retained. The stored value starts out as NaN, which is what gets
 * formatted when no field has been processed.
 */
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /** Per-key calculator holding the running maximum. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            /* The first value is always accepted; later ones only when strictly larger. */
            if (_isFirst || candidate > _value)
            {
                _value = candidate;
                _isFirst = false;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}

unittest // MaxOperator
{
    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MaxOperator(col1File, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(col2File, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(col2File, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(col3File, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(col3File, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(col3File, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    auto col1misFile = [[""], ["-10"], ["10"]];
    testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "nan", "-10", "10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "5", "5", "10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}

/** RangeOperator outputs the difference between the maximum and minimum values.
 *
 * With a single value, or when all values are equal, the range is zero. Both bounds
 * start at zero, so the result is also zero when no field has been processed. This
 * is a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /** Per-key calculator tracking both the running minimum and maximum. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = to!double(nextField);

            /* The first value seeds both bounds. */
            if (_isFirst)
            {
                _isFirst = false;
                _minValue = candidate;
                _maxValue = candidate;
                return;
            }

            /* Once seeded, a value can extend at most one of the two bounds. */
            if (candidate > _maxValue) _maxValue = candidate;
            else if (candidate < _minValue) _minValue = candidate;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]); 2954 testSingleFieldOperator!RangeOperator(col2File, 0, "range", ["0", "0", "1", "2"]); 2955 testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]); 2956 testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]); 2957 testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]); 2958 testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]); 2959 2960 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 2961 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], 2962 new MissingFieldPolicy(true, "")); // Exclude missing 2963 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], 2964 new MissingFieldPolicy(false, "5.5")); // Replace missing 2965 } 2966 2967 /** SumOperator produces the sum of all the values. This is a numeric operator. 
/** SumOperator produces the sum of all the values. This is a numeric operator.
 *
 * The running total starts at zero, so zero is reported when no field has been
 * processed.
 */
final class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Per-key calculator accumulating the total. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }
}

unittest // SumOperator
{
    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(col1File, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(col2File, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(col2File, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(col3File, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(col3File, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(col3File, 2, "sum", ["0", "-4.5", "-5", "7"]);

    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        new MissingFieldPolicy(false, "1.5"));  // Replace missing
}

/** MeanOperator produces the mean (average) of all the values. This is a numeric operator.
 *
 * A running total and a count are kept; the result is NaN when no field has been
 * processed.
 */
final class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Per-key calculator accumulating the total and the number of values. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
            _count++;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* No values means no meaningful average: report NaN rather than dividing by zero. */
            double mean = double.nan;
            if (_count > 0) mean = _total / to!double(_count);

            return printOptions.formatNumber(mean);
        }
    }
}

unittest // MeanOperator
{
    auto col1File = [["10"], ["9.5"], ["7.5"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(col1File, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(col2File, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(col2File, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(col3File, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(col3File, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(col3File, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    auto col1misFile = [[""], ["6"], [""], ["14"], ["40"]];
    testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         new MissingFieldPolicy(false, "0"));  // Replace missing
}
/** MedianOperator produces the median of all the values. This is a numeric operator.
 *
 * The constructor requests that numeric field values be saved
 * (setSaveFieldValuesNumeric), so every value for the field is held in memory by the
 * unique key values lists. The calculator itself accumulates nothing.
 */
final class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Per-key calculator; reads the median from the saved values at output time. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the values are saved by the values lists. */
        final override void processNextField(const char[] nextField)
        {
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto median = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(median);
        }
    }
}

unittest // MedianOperator
{
    auto col1File = [["10"], ["9.5"], ["7.5"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(col1File, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(col2File, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(col2File, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(col3File, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(col3File, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(col3File, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           new MissingFieldPolicy(false, "0"));  // Replace missing
}

/** QuantileOperator produces the value representing the data at a cumulative probability.
 * This is a numeric operation.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). The quantile is computed over the sorted numeric values saved for the field.
 * Each operator instance takes one probability; it is common to create several instances
 * for the same field when summarizing.
 *
 * The constructor requests that numeric field values be saved
 * (setSaveFieldValuesNumeric), so every value for the field is held in memory by the
 * unique key values lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(probability >= 0.0 && probability <= 1.0);
        import std.format : format;

        /* Header is "pct" plus the percentage. Zero is special-cased to "pct0";
         * presumably this avoids the zero-padded "pct00" that "%02g" would give.
         */
        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Per-key calculator; computes the quantile from the saved values at output time. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the values are saved by the values lists. */
        final override void processNextField(const char[] nextField)
        {
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            return printOptions.formatNumber(
                quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}
*/ 3224 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0); 3225 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0); 3226 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0); 3227 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0); 3228 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0); 3229 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0); 3230 3231 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0); 3232 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0); 3233 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0); 3234 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0); 3235 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0); 3236 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0); 3237 3238 /* For missing policies, re-use the median tests. 
/** MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value; no normalization constant is applied.
 *
 * The constructor requests that numeric field values be saved
 * (setSaveFieldValuesNumeric), so every value for the field is held in memory by the
 * unique key values lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /** Per-key calculator; derives the MAD from the saved values at output time. */
    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the values are saved by the values lists. */
        final override void processNextField(const char[] nextField)
        {
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            /* The median is fetched before the raw values, matching the original call order. */
            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto values = valuesLists.numericValues(fieldIndex);

            /* Absolute distance of each value from the median; the MAD is the median of these. */
            auto deviations = new double[values.length];
            foreach (size_t pos, double value; values)
            {
                deviations[pos] = abs(value - center);
            }

            return printOptions.formatNumber(rangeMedian(deviations));
        }
    }
}

unittest // MadOperator
{
    auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]);

    auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]];
    testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
                                        new MissingFieldPolicy(false, "0"));  // Replace missing
}
/** VarianceOperator generates the variance of the field's values. This is a numeric
 * operator.
 *
 * The calculator keeps a count, a running mean, and a running sum of squared
 * differences from the mean, updating all three as each value arrives. The result
 * divides that sum by (count - 1) and is NaN when fewer than two values were seen.
 */
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /** Per-key calculator holding the running count, mean, and squared-difference sum. */
    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;      // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double sample = to!double(nextField);
            double offset = sample - _mean;     // Distance from the mean before this update

            _mean += offset / _count;
            _m2 += offset * (sample - _mean);   // Uses the mean after this update
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* At least two values are needed for the (count - 1) divisor. */
            double variance = double.nan;
            if (_count >= 2.0) variance = _m2 / (_count - 1.0);

            return printOptions.formatNumber(variance);
        }
    }
}

unittest // VarianceOperator
{
    auto col1File = [["5"], ["10"], ["15"]];
    auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]);

    auto col1misFile = [["5"], ["10"], [""]];
    testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"],
                                             new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"],
                                             new MissingFieldPolicy(false, "15"));  // Replace missing
}
/** StDevOperator generates the standard deviation of the field's values. This is a
 * numeric operator.
 *
 * The calculator keeps a count, a running mean, and a running sum of squared
 * differences from the mean, updating all three as each value arrives. The result is
 * the square root of that sum divided by (count - 1), and is NaN when fewer than two
 * values were seen.
 */
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /** Per-key calculator holding the running count, mean, and squared-difference sum. */
    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;      // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            double sample = to!double(nextField);
            double offset = sample - _mean;     // Distance from the mean before this update

            _mean += offset / _count;
            _m2 += offset * (sample - _mean);   // Uses the mean after this update
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            /* At least two values are needed for the (count - 1) divisor. */
            double stdev = double.nan;
            if (_count >= 2.0) stdev = sqrt(_m2 / (_count - 1.0));

            return printOptions.formatNumber(stdev);
        }
    }
}
/* StDevOperator unit tests - These would be improved with a tolerance option.
 *
 * Each call passes an input table, a zero-based field index, the expected header
 * ("stdev"), and a list of expected values given as strings. Non-integer expectations
 * are written to 12 significant digits, the default output precision described in the
 * tool's help text.
 * NOTE(review): the exact meaning of each position in the expected list is defined by
 * testSingleFieldOperator, which is outside this view - confirm there.
 */
unittest
{
    /* Input tables with one, two, and three columns respectively. */
    auto col1File = [["1"], ["4"], ["7"]];
    auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    /* The first two expected entries are "nan" in every case below. */
    testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* Missing-value handling: the last row's field is empty. */
    auto col1misFile = [["1"], ["4"], [""]];
    testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
                                          new MissingFieldPolicy(false, "7"));  // Replace missing
}
/** UniqueCountOperator generates the number of unique values. Uniqueness is decided
 * by exact text match, not by numeric comparison.
 *
 * Every distinct field value is held in memory (as a key of an associative array)
 * as part of this calculation.
 */
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    /** Per-key calculator holding the set of distinct values seen. */
    final class UniqueCountCalculator : SingleFieldCalculator
    {
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Convert to a string only when the value has not been seen before. */
            if ((nextField in _values) is null)
            {
                _values[to!string(nextField)] = true;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}
/** MissingCountOperator generates the number of missing values. This overrides
 * the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate `MissingFieldPolicy(false, "")`
 * instead of the caller's policy.
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    /** Per-key calculator counting the fields the global policy considers missing. */
    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;

            if (globalPolicy.isMissingField(nextField))
            {
                _missingCount += 1;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3572 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]); 3573 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]); 3574 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]); 3575 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]); 3576 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]); 3577 3578 auto excludeMissing = new MissingFieldPolicy(true, ""); 3579 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3580 3581 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing); 3582 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing); 3583 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing); 3584 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing); 3585 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing); 3586 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing); 3587 3588 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing); 3589 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing); 3590 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing); 3591 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing); 3592 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing); 3593 
/** NotMissingCountOperator generates the number of not-missing values. This overrides
 * the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate `MissingFieldPolicy(false, "")`
 * instead of the caller's policy.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    /** Per-key calculator counting the fields the global policy does not consider missing. */
    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;

            if (!globalPolicy.isMissingField(nextField))
            {
                _notMissingCount += 1;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]); 3650 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]); 3651 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]); 3652 3653 auto excludeMissing = new MissingFieldPolicy(true, ""); 3654 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3655 3656 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing); 3657 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing); 3658 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing); 3659 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing); 3660 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing); 3661 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing); 3662 3663 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing); 3664 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing); 3665 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing); 3666 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing); 3667 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing); 3668 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing); 
}

/** ModeOperator outputs the value that occurs most often. When several values share
 * the highest count, the one that was seen first is produced.
 *
 * Every distinct field value is kept in memory, together with its occurrence count,
 * as part of this calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    /** Creates a fresh calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new ModeCalculator(fieldIndex);
        return calculator;
    }

    /** Tracks per-value occurrence counts plus the order in which values first appeared. */
    final class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;          // Occurrences of each distinct value.
        private Appender!(string[]) _uniqueValues;    // Distinct values, in first-seen order.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /** Bumps the count of a known value, or registers a new value with a count of one. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
                return;
            }

            /* First sighting: the buffer is copied to a string before it is stored. */
            string value = nextField.to!string;
            _uniqueValues.put(value);
            _valueCounts[value] = 1;
        }

        /** Returns the most frequent value, or the empty string if nothing was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string bestValue = "";
            size_t bestCount = 0;

            /* Walking in first-seen order with a strict comparison lets the earliest
             * value win ties.
             */
            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                auto occurrences = _valueCounts[candidate];

                if (occurrences <= bestCount) continue;

                bestValue = candidate;
                bestCount = occurrences;
            }

            return bestValue;
        }
    }
}

unittest // ModeOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
                                         new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** ModeCountOperator outputs the count of the most frequent value seen.
 *
 * All the field values are stored in memory as part of this calculation.
*
 */
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    /** Creates a fresh calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new ModeCountCalculator(fieldIndex);
        return calculator;
    }

    /** Tracks how many times each distinct value has occurred. */
    final class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;    // Occurrences of each distinct value.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /** Bumps the count of a known value, or registers a new value with a count of one. */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                ++(*existingCount);
                return;
            }

            /* First sighting: the buffer is copied to a string before it is used as a key. */
            string value = nextField.to!string;
            _valueCounts[value] = 1;
        }

        /** Returns the largest occurrence count (zero if nothing was seen), formatted
         * via the print options.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t largest = 0;

            foreach (occurrences; _valueCounts.byValue)
            {
                largest = (occurrences > largest) ? occurrences : largest;
            }

            return printOptions.formatNumber(largest);
        }
    }
}

unittest // ModeCountOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]);
testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
                                              new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
                                              new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** ValuesOperator outputs every value of the field, joined with the values delimiter
 * taken from the print options.
 *
 * All the field values are held in memory for this calculation. The operator does not
 * store them itself; it requests that field text be saved and reads the values back
 * from the unique key value lists when calculating.
 */

final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);

        /* Request that the field's text values be saved for later retrieval. */
        setSaveFieldValuesText();
    }

    /** Creates a fresh calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new ValuesCalculator(fieldIndex);
        return calculator;
    }

    /** Holds no state of its own; output is built from the saved field values. */
    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Joins the saved text values for this field with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return savedValues.join(printOptions.valuesDelimiter);
        }
    }
}

unittest // ValuesOperator
{
    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]);

    testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
                                         new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter
 * character. Values are output in the order seen.
 *
 * All unique field values are stored in memory as part of this calculation.
*
 */
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    /** Creates a fresh calculator bound to this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        auto calculator = new UniqueValuesCalculator(fieldIndex);
        return calculator;
    }

    /** Records each distinct value once, preserving first-seen order. */
    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        private size_t[string] _valuesHash;           // Membership set; the stored number is not read.
        private Appender!(string[]) _uniqueValues;    // Distinct values, in first-seen order.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /** Records the field if it has not been seen before; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if ((nextField in _valuesHash) !is null) return;

            /* First sighting: the buffer is copied to a string before it is stored. */
            string value = nextField.to!string;
            _uniqueValues.put(value);
            _valuesHash[value] = 1;
        }

        /** Joins the distinct values, in first-seen order, with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto seenInOrder = _uniqueValues.data;
            return seenInOrder.join(printOptions.valuesDelimiter);
        }
    }
}

unittest // UniqueValuesOperator
{
    auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]);
    testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]);
    testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]);

    testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
                                                 new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
                                                 new MissingFieldPolicy(false, "X"));  // Replace missing
}