1 /** 2 Command line tool that reads TSV files and summarizes field values associated with 3 equivalent keys. 4 5 Copyright (c) 2016-2018, eBay Software Foundation 6 Initially written by Jon Degenhardt 7 8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 9 */ 10 module tsv_summarize; 11 12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter; 13 import std.array : join; 14 import std.conv : to; 15 import std.format : format; 16 import std.range; 17 import std.stdio; 18 import std.typecons : tuple; 19 import std.container : DList; 20 21 version(unittest) 22 { 23 // When running unit tests, use main from -main compiler switch. 24 } 25 else 26 { 27 int main(string[] cmdArgs) 28 { 29 /* When running in DMD code coverage mode, turn on report merging. */ 30 version(D_Coverage) version(DigitalMars) 31 { 32 import core.runtime : dmd_coverSetMerge; 33 dmd_coverSetMerge(true); 34 } 35 36 TsvSummarizeOptions cmdopt; 37 auto r = cmdopt.processArgs(cmdArgs); 38 if (!r[0]) return r[1]; 39 version(LDC_Profile) 40 { 41 import ldc.profile : resetAll; 42 resetAll(); 43 } 44 try tsvSummarize(cmdopt, cmdArgs[1..$]); 45 catch (Exception exc) 46 { 47 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 48 return 1; 49 } 50 return 0; 51 } 52 } 53 54 auto helpTextVerbose = q"EOS 55 Synopsis: tsv-summarize [options] file [file...] 56 57 tsv-summarize reads tabular data files (tab-separated by default), tracks 58 field values for each unique key, and runs summarization algorithms. Consider 59 the file data.tsv: 60 61 make color time 62 ford blue 131 63 chevy green 124 64 ford red 128 65 bmw black 118 66 bmw black 126 67 ford blue 122 68 69 The min and average times for each make is generated by the command: 70 71 $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv 72 73 This produces: 74 75 make time_min time_mean 76 ford 122 127 77 chevy 124 124 78 bmw 118 122 79 80 Using '--group 1,2' will group by both 'make' and 'color'. Omitting the 81 '--group-by' entirely summarizes fields for full file. 82 83 The program tries to generate useful headers, but custom headers can be 84 specified. Example (using -g and -H shortcuts for --header and --group-by): 85 86 $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv 87 88 Most operators take custom headers in a similarly way, generally following: 89 90 --<operator-name> FIELD[:header] 91 92 Operators can be specified multiple times. They can also take multiple 93 fields (though not when a custom header is specified). Examples: 94 95 --median 2,3,4 96 --median 2-5,7-11 97 98 The quantile operator requires one or more probabilities after the fields: 99 100 --quantile 2:0.25 // Quantile 1 of field 2 101 --quantile 2-4:0.25,0.5,0.75 // Q1, Median, Q3 of fields 2, 3, 4 102 103 Summarization operators available are: 104 count range mad values 105 retain sum var unique-values 106 first mean stddev unique-count 107 last median mode missing-count 108 min quantile mode-count not-missing-count 109 max 110 111 Numeric values are printed to 12 significant digits by default. This can be 112 changed using the '--p|float-precision' option. If six or less it sets the 113 number of significant digits after the decimal point. If greater than six it 114 sets the total number of significant digits. 115 116 Calculations hold onto the minimum data needed while reading data. A few 117 operations like median keep all data values in memory. These operations will 118 start to encounter performance issues as available memory becomes scarce. The 119 size that can be handled effectively is machine dependent, but often quite 120 large files can be handled. 121 122 Operations requiring numeric entries will signal an error and terminate 123 processing if a non-numeric entry is found. 124 125 Missing values are not treated specially by default, this can be changed 126 using the '--x|exclude-missing' or '--r|replace-missing' option. The former 127 turns off processing for missing values, the latter uses a replacement value. 128 129 Options: 130 EOS"; 131 132 auto helpText = q"EOS 133 Synopsis: tsv-summarize [options] file [file...] 134 135 tsv-summarize runs aggregation operations on fields in tab-separated value 136 files. Operations can be run against the full input data or grouped by key 137 fields. Use --help-verbose for more extensive help. 138 139 Options: 140 EOS"; 141 142 /** Command line options - Container and processing. The processArgs method is used to 143 * process the command line. 144 */ 145 struct TsvSummarizeOptions { 146 string programName; 147 148 /* Options set directly by on the command line.. */ 149 size_t[] keyFields; // -g, --group-by 150 bool hasHeader = false; // --header 151 bool writeHeader = false; // -w, --write-header 152 char inputFieldDelimiter = '\t'; // --d|delimiter 153 char valuesDelimiter = '|'; // --v|values-delimiter 154 size_t floatPrecision = 12; // --p|float-precision 155 bool excludeMissing = false; // --x|exclude-missing 156 string missingValueReplacement; // --r|replace-missing 157 bool helpVerbose = false; // --help-verbose 158 bool versionWanted = false; // --V|version 159 DList!Operator operators; // Operators, in the order specified. 160 size_t endFieldIndex = 0; // Derived value. Max field index used plus one. 161 MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy; // Derived value. 162 163 /* Returns a tuple. First value is true if command line arguments were successfully 164 * processed and execution should continue, or false if an error occurred or the user 165 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 166 * 167 * Returning true (execution continues) means args have been validated and derived 168 * values calculated. In addition, field indices have been converted to zero-based. 169 */ 170 auto processArgs (ref string[] cmdArgs) { 171 import std.algorithm : any, each; 172 import std.getopt; 173 import std.path : baseName, stripExtension; 174 import std.typecons : Yes, No; 175 import getopt_inorder; 176 import tsvutil : makeFieldListOptionHandler; 177 178 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 179 180 try 181 { 182 arraySep = ","; // Use comma to separate values in command line options 183 auto r = getoptInorder( 184 cmdArgs, 185 "help-verbose", " Print full help.", &helpVerbose, 186 187 std.getopt.config.caseSensitive, 188 "V|version", " Print version information and exit.", &versionWanted, 189 std.getopt.config.caseInsensitive, 190 191 "g|group-by", "<field-list> Fields to use as key.", 192 keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex), 193 194 std.getopt.config.caseSensitive, 195 "H|header", " Treat the first line of each file as a header.", &hasHeader, 196 std.getopt.config.caseInsensitive, 197 198 "w|write-header", " Write an output header even if there is no input header.", &writeHeader, 199 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter, 200 "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter, 201 "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision, 202 "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing, 203 "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement, 204 "count", " Count occurrences of each unique key.", &countOptionHandler, 205 "count-header", "STR Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler, 206 "retain", "<field-list> Retain one copy of the field.", &operatorOptionHandler!RetainOperator, 207 "first", "<field-list>[:STR] First value seen.", &operatorOptionHandler!FirstOperator, 208 "last", "<field-list>[:STR] Last value seen.", &operatorOptionHandler!LastOperator, 209 "min", "<field-list>[:STR] Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator, 210 "max", "<field-list>[:STR] Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator, 211 "range", "<field-list>[:STR] Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator, 212 "sum", "<field-list>[:STR] Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator, 213 "mean", "<field-list>[:STR] Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator, 214 "median", "<field-list>[:STR] Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator, 215 "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler, 216 "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator, 217 "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator, 218 "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator, 219 "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator, 220 "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator, 221 "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator, 222 "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator, 223 "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator, 224 "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator, 225 "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator, 226 ); 227 228 if (r.helpWanted) 229 { 230 defaultGetoptPrinter(helpText, r.options); 231 return tuple(false, 0); 232 } 233 else if (helpVerbose) 234 { 235 defaultGetoptPrinter(helpTextVerbose, r.options); 236 return tuple(false, 0); 237 } 238 else if (versionWanted) 239 { 240 import tsvutils_version; 241 writeln(tsvutilsVersionNotice("tsv-summarize")); 242 return tuple(false, 0); 243 } 244 245 consistencyValidations(); 246 derivations(); 247 } 248 catch (Exception exc) 249 { 250 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 251 return tuple(false, 1); 252 } 253 return tuple(true, 0); 254 } 255 256 /* operationOptionHandler functions are callbacks that process command line options 257 * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers 258 * check syntactic correctness and instantiate Operator objects that do the work. This 259 * is also where 1-upped field numbers are converted to 0-based indices. 260 */ 261 private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal) 262 { 263 import std.range : enumerate; 264 import std.typecons : Yes, No; 265 import tsvutil : parseFieldList; 266 267 auto valSplit = findSplit(optionVal, ":"); 268 269 if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty)) 270 { 271 throw new Exception( 272 format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.", 273 option, optionVal, option, option)); 274 } 275 276 try foreach (fieldNum, fieldIndex; 277 valSplit[0].to!string 278 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 279 { 280 auto op = new OperatorClass(fieldIndex, globalMissingPolicy); 281 282 if (!valSplit[2].empty) // Header specified 283 { 284 if (fieldNum > 1) 285 { 286 throw new Exception( 287 format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.", 288 option, optionVal)); 289 } 290 else if (!op.allowCustomHeader) 291 { 292 throw new Exception( 293 format("Invalid option: '--%s %s'. Operator does not support custom headers.", 294 option, optionVal)); 295 } 296 297 op.setCustomHeader(valSplit[2].to!string); 298 } 299 300 operators.insertBack(op); 301 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; 302 } 303 catch (Exception exc) 304 { 305 import std.format : format; 306 exc.msg = format("[--%s] %s", option, exc.msg); 307 throw exc; 308 } 309 } 310 311 /* QuantileOperator has a different syntax and needs a custom command option handler. */ 312 private void quantileOperatorOptionHandler(string option, string optionVal) 313 { 314 import std.typecons : Yes, No; 315 import tsvutil : parseFieldList; 316 317 auto formatErrorMsg(string option, string optionVal) 318 { 319 return format( 320 "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.", 321 option, optionVal, option, option); 322 } 323 324 auto split1 = findSplit(optionVal, ":"); 325 326 if (split1[0].empty || (!split1[1].empty && split1[2].empty)) 327 throw new Exception(formatErrorMsg(option, optionVal)); 328 329 auto split2 = findSplit(split1[2], ":"); 330 331 if (split2[0].empty || (!split2[1].empty && split2[2].empty)) 332 throw new Exception(formatErrorMsg(option, optionVal)); 333 334 auto fieldStr = split1[0]; 335 auto probStr = split2[0]; 336 auto header = split2[2]; 337 338 size_t[] fieldIndices; 339 double[] probs; 340 341 try foreach (fieldIndex; 342 fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)) 343 { 344 fieldIndices ~= fieldIndex; 345 } 346 catch (Exception exc) 347 { 348 import std.format : format; 349 exc.msg = format("[--%s] %s", option, exc.msg); 350 throw exc; 351 } 352 353 foreach (str; probStr.splitter(',')) 354 { 355 double p; 356 357 try p = str.to!double; 358 catch (Exception exc) 359 throw new Exception(formatErrorMsg(option, optionVal)); 360 361 if (!(p >= 0.0 && p <= 1.0)) 362 throw new Exception( 363 format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].", 364 option, optionVal, p)); 365 366 probs ~= p; 367 } 368 369 if (!header.empty && (fieldIndices.length > 1 || probs.length > 1)) 370 { 371 throw new Exception( 372 format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.", 373 option, optionVal)); 374 } 375 376 assert (fieldIndices.length > 0); 377 assert (probs.length > 0); 378 assert (header.empty || (fieldIndices.length == 1 && probs.length == 1)); 379 380 foreach (fieldIndex; fieldIndices) 381 { 382 foreach (p; probs) 383 { 384 auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p); 385 if (!header.empty) op.setCustomHeader(header); 386 operators.insertBack(op); 387 } 388 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; 389 } 390 } 391 392 private void countOptionHandler() 393 { 394 operators.insertBack(new CountOperator()); 395 } 396 397 private void countHeaderOptionHandler(string option, string optionVal) 398 { 399 auto op = new CountOperator(); 400 op.setCustomHeader(optionVal); 401 operators.insertBack(op); 402 } 403 404 /* This routine does validations not handled by processArgs. */ 405 private void consistencyValidations() 406 { 407 if (operators.empty) 408 { 409 throw new Exception("At least one summary operator is required."); 410 } 411 412 if (inputFieldDelimiter == valuesDelimiter) 413 { 414 throw new Exception("Cannot use the same character for both --d|field-delimiter and --v|values-delimiter."); 415 } 416 417 if (excludeMissing && missingValueReplacement.length != 0) 418 { 419 throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'."); 420 } 421 } 422 423 /* Post-processing derivations. */ 424 void derivations() 425 { 426 /* keyFields need to part of the endFieldIndex, which is one past the last field index. */ 427 keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } ); 428 429 /* Missing field policy. */ 430 globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement); 431 } 432 } 433 434 /** tsvSummarize does the primary work of the tsv-summarize program. 435 */ 436 void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles) 437 { 438 import tsvutil : throwIfWindowsNewlineOnUnix; 439 440 /* Pick the Summarizer based on the number of key-fields entered. */ 441 auto summarizer = 442 (cmdopt.keyFields.length == 0) 443 ? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))( 444 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 445 446 : (cmdopt.keyFields.length == 1) 447 ? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))( 448 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 449 450 : new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))( 451 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); 452 453 /* Add the operators to the Summarizer. */ 454 summarizer.setOperators(inputRangeObject(cmdopt.operators[])); 455 456 /* Process each input file, one line at a time. */ 457 auto lineFields = new char[][](cmdopt.endFieldIndex); 458 bool headerFound = false; 459 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 460 { 461 auto inputStream = (filename == "-") ? stdin : filename.File(); 462 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 463 { 464 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum); 465 466 /* Copy the needed number of fields to the fields array. 467 * Note: The number is zero if no operator needs fields. Notably, the count 468 * operator. Used by itself, it counts the number input lines (ala 'wc -l'). 469 */ 470 if (cmdopt.endFieldIndex > 0) 471 { 472 size_t fieldIndex = 0; 473 foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter)) 474 { 475 if (fieldIndex == cmdopt.endFieldIndex) break; 476 lineFields[fieldIndex] = fieldValue; 477 fieldIndex++; 478 } 479 480 if (fieldIndex == 0) 481 { 482 assert(cmdopt.endFieldIndex > 0); 483 assert(line.length == 0); 484 485 /* Bug work-around. Empty lines are not handled properly by splitter. 486 * - Bug: https://issues.dlang.org/show_bug.cgi?id=15735 487 * - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 488 * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the 489 * unique values in field 1. If there's only one column, then an empty 490 * line becomes an empty string for field 1. Work-around: Point to the 491 * line. It's an empty string. 492 */ 493 lineFields[fieldIndex] = line; 494 fieldIndex++; 495 } 496 497 if (fieldIndex < cmdopt.endFieldIndex) 498 { 499 throw new Exception( 500 format("Not enough fields in line. File: %s, Line: %s", 501 (filename == "-") ? "Standard Input" : filename, lineNum)); 502 } 503 } 504 505 if (cmdopt.hasHeader && lineNum == 1) 506 { 507 if (!headerFound) 508 { 509 summarizer.processHeaderLine(lineFields); 510 headerFound = true; 511 } 512 } 513 else 514 { 515 /* Process the line. Processing will fail (throw) if a field cannot be 516 * converted to the expected type. 517 */ 518 try summarizer.processNextLine(lineFields); 519 catch (Exception exc) 520 { 521 throw new Exception( 522 format("Could not process line or field: %s\n File: %s Line: %s%s", 523 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 524 (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); 525 } 526 } 527 } 528 } 529 530 debug writeln("[tsvSummarize] After reading all data."); 531 532 /* Whew! We're done processing input data. Run the calculations and print. */ 533 auto printOptions = SummarizerPrintOptions( 534 cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); 535 auto stdoutWriter = stdout.lockingTextWriter; 536 537 if (cmdopt.hasHeader || cmdopt.writeHeader) 538 { 539 summarizer.writeSummaryHeader(stdoutWriter, printOptions); 540 } 541 542 summarizer.writeSummaryBody(stdoutWriter, printOptions); 543 } 544 545 /** The default field header. This is used when the input doesn't have field headers, 546 * but field headers are used in the output. The default is "fieldN", where N is the 547 * 1-upped field number. 548 */ 549 string fieldHeaderFromIndex(size_t fieldIndex) 550 { 551 enum prefix = "field"; 552 return prefix ~ (fieldIndex + 1).to!string; 553 } 554 555 unittest 556 { 557 assert(fieldHeaderFromIndex(0) == "field1"); 558 assert(fieldHeaderFromIndex(10) == "field11"); 559 } 560 561 /** Produce a summary header from a field header. 562 * 563 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is 564 * "length" and the operation is "max", the summary header is "length_max". The field 565 * header typically comes a header line in the input data or was constructed by 566 * fieldHeaderFromIndex(). 567 * 568 * If operationName is the empty string, then fieldHeader is used unchanged. This supports 569 * the Retain operator. 570 */ 571 string summaryHeaderFromFieldHeader(string fieldHeader, string operationName) 572 { 573 return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader; 574 } 575 576 unittest 577 { 578 assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc"); 579 assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield"); 580 } 581 582 /** SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically 583 * specified with command line options, it is separated out for modularity. 584 */ 585 struct SummarizerPrintOptions 586 { 587 char fieldDelimiter; 588 char valuesDelimiter; 589 size_t floatPrecision = 12; 590 591 import std.traits : isFloatingPoint, isIntegral; 592 593 auto formatNumber(T)(T n) const 594 if (isFloatingPoint!T || isIntegral!T) 595 { 596 import tsv_numerics : formatNumber; 597 return formatNumber!T(n, floatPrecision); 598 } 599 } 600 601 /** A Summarizer object maintains the state of the summarization and performs basic 602 * processing. Handling of files and input lines is left to the caller. 603 * 604 * Classes supporting the Summarizer must implement the methods: 605 * - setOperators - Called after initializing the object for each operator to be processed. 606 * - processHeaderLine - Called to process the header line of each file. Returns true if 607 * it was the first header line processed (used when reading multiple files). 608 * - processNextLine - Called to process non-header lines. 609 * - writeSummaryHeader - Called to write the header line. 610 * - writeSummaryBody - Called to write the result lines. 611 * 612 */ 613 interface Summarizer(OutputRange) 614 { 615 /** Called after initializing the object for each operator to be processed. */ 616 void setOperators(InputRange!Operator op); 617 618 /** Called to process the header line of each file. Returns true if it was the 619 * first header line processed (used when reading multiple files). 620 */ 621 bool processHeaderLine(const char[][] lineFields); 622 623 /** Called to process non-header lines. */ 624 void processNextLine(const char[][] lineFields); 625 626 /** Called to write the header line. */ 627 void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 628 629 /** Called to write the result lines. */ 630 void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 631 } 632 633 /** SummarizerBase performs work shared by all sumarizers, most everything except for 634 * handling of unique keys. 635 * 636 * The base class handles creation, allocates storage for Operators and SharedFieldValues, 637 * and similar. Derived classes deal primarily with unique keys and the associated Calculators 638 * and UniqueKeyValuesLists. 639 */ 640 class SummarizerBase(OutputRange) : Summarizer!OutputRange 641 { 642 private char _inputFieldDelimiter; 643 private bool _hasProcessedFirstHeaderLine = false; 644 private SharedFieldValues _sharedFieldValues = null; // Null if no shared field value lists. 645 protected MissingFieldPolicy _missingPolicy; 646 protected DList!Operator _operators; 647 protected size_t _numOperators = 0; 648 649 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 650 { 651 _inputFieldDelimiter = inputFieldDelimiter; 652 _missingPolicy = missingPolicy; 653 } 654 655 char inputFieldDelimiter() const @property 656 { 657 return _inputFieldDelimiter; 658 } 659 660 /** Sets the Operators used by the Summarizer. Called after construction. */ 661 void setOperators(InputRange!Operator operators) 662 { 663 foreach (op; operators) 664 { 665 _operators.insertBack(op); 666 _numOperators++; 667 auto numericFieldsToSave = op.numericFieldsToSave(); 668 auto textFieldsToSave = op.textFieldsToSave(); 669 670 if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0) 671 { 672 if (_sharedFieldValues is null) 673 { 674 _sharedFieldValues = new SharedFieldValues(); 675 } 676 numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x)); 677 textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x)); 678 } 679 } 680 } 681 682 /** Called to process the header line of each file. Returns true if it was the 683 * first header line processed (used when reading multiple files). 684 */ 685 bool processHeaderLine(const char[][] lineFields) 686 { 687 if (!_hasProcessedFirstHeaderLine) 688 { 689 _operators.each!(x => x.processHeaderLine(lineFields)); 690 _hasProcessedFirstHeaderLine = true; 691 return true; 692 } 693 else 694 { 695 return false; 696 } 697 } 698 699 protected final UniqueKeyValuesLists makeUniqueKeyValuesLists() 700 { 701 return (_sharedFieldValues is null) 702 ? null 703 : _sharedFieldValues.makeUniqueKeyValuesLists; 704 } 705 706 abstract void processNextLine(const char[][] lineFields); 707 abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 708 abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 709 } 710 711 /** The NoKeySummarizer is used when summarizing values across the entire input. 712 * 713 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing 714 * through that mechanism. 715 */ 716 class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange 717 { 718 private Calculator[] _calculators; 719 private UniqueKeyValuesLists _valueLists; 720 721 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 722 { 723 super(inputFieldDelimiter, missingPolicy); 724 } 725 726 /** Called after initializing the object for each operator to be processed. */ 727 override void setOperators(InputRange!Operator operators) 728 { 729 super.setOperators(operators); 730 731 /* Only one Calculator per Operation, so create them as Operators are added. */ 732 foreach (op; operators) _calculators ~= op.makeCalculator; 733 _valueLists = super.makeUniqueKeyValuesLists(); 734 } 735 736 /** Called to process non-header lines. */ 737 override void processNextLine(const char[][] lineFields) 738 { 739 _calculators.each!(x => x.processNextLine(lineFields)); 740 if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy); 741 } 742 743 /** Called to write the header line. */ 744 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 745 { 746 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 747 put(outputStream, '\n'); 748 } 749 750 /** Called to write the result lines. */ 751 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 752 { 753 put(outputStream, 754 _calculators[] 755 .map!(x => x.calculate(_valueLists, printOptions)) 756 .join(printOptions.fieldDelimiter)); 757 put(outputStream, '\n'); 758 } 759 } 760 761 /** KeySummarizerBase does work shared by the single key and multi-key summarizers. 762 * 763 * The primary difference between those two is the formation of the key. The primary 764 * reason for separating those into two separate classes is to simplify (speed-up) 765 * handling of single field keys, which are the most common use case. 766 */ 767 class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange 768 { 769 protected struct UniqueKeyData 770 { 771 Calculator[] calculators; 772 UniqueKeyValuesLists valuesLists; 773 } 774 775 private DList!string _uniqueKeys; 776 private UniqueKeyData[string] _uniqueKeyData; 777 778 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 779 { 780 super(inputFieldDelimiter, missingPolicy); 781 } 782 783 protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields) 784 { 785 debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string); 786 787 auto dataPtr = (key in _uniqueKeyData); 788 auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr; 789 790 data.calculators.each!(x => x.processNextLine(lineFields)); 791 if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy); 792 } 793 794 protected UniqueKeyData addUniqueKey(string key) 795 { 796 assert(key !in _uniqueKeyData); 797 798 _uniqueKeys.insertBack(key); 799 800 auto calculators = new Calculator[_numOperators]; 801 size_t i = 0; 802 foreach (op; _operators) 803 { 804 calculators[i] = op.makeCalculator; 805 i++; 806 } 807 808 return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists()); 809 } 810 811 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 812 { 813 put(outputStream, keyFieldHeader()); 814 put(outputStream, printOptions.fieldDelimiter); 815 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 816 put(outputStream, '\n'); 817 } 818 819 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 820 { 821 foreach(key; _uniqueKeys) 822 { 823 auto data = _uniqueKeyData[key]; 824 put(outputStream, key); 825 put(outputStream, printOptions.fieldDelimiter); 826 put(outputStream, 827 data.calculators[] 828 .map!(x => x.calculate(data.valuesLists, printOptions)) 829 .join(printOptions.fieldDelimiter)); 830 put(outputStream, '\n'); 831 } 832 } 833 834 abstract string keyFieldHeader() const @property; 835 } 836 837 /** This Summarizer is for the case where the unique key is based on exactly one field. 838 */ 839 class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 840 { 841 private size_t _keyFieldIndex = 0; 842 private string _keyFieldHeader; 843 private DList!string _uniqueKeys; 844 845 this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 846 { 847 super(inputFieldDelimiter, missingPolicy); 848 _keyFieldIndex = keyFieldIndex; 849 _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex); 850 } 851 852 override string keyFieldHeader() const @property 853 { 854 return _keyFieldHeader; 855 } 856 857 override bool processHeaderLine(const char[][] lineFields) 858 { 859 assert(_keyFieldIndex <= lineFields.length); 860 861 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 862 if (isFirstHeaderLine) 863 { 864 _keyFieldHeader = lineFields[_keyFieldIndex].to!string; 865 } 866 return isFirstHeaderLine; 867 } 868 869 override void processNextLine(const char[][] lineFields) 870 { 871 assert(_keyFieldIndex < lineFields.length); 872 processNextLineWithKey(lineFields[_keyFieldIndex], lineFields); 873 } 874 } 875 876 /** This Summarizer is for the case where the unique key is based on multiple fields. 877 */ 878 class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 879 { 880 private size_t[] _keyFieldIndices; 881 private string _keyFieldHeader; 882 private DList!string _uniqueKeys; 883 884 this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 885 { 886 super(inputFieldDelimiter, missingPolicy); 887 _keyFieldIndices = keyFieldIndices.dup; 888 _keyFieldHeader = 889 _keyFieldIndices.map!(i => fieldHeaderFromIndex(i)) 890 .join(inputFieldDelimiter); 891 } 892 893 override string keyFieldHeader() const @property 894 { 895 return _keyFieldHeader; 896 } 897 898 override bool processHeaderLine(const char[][] lineFields) 899 { 900 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 901 assert(_keyFieldIndices.length >= 2); 902 903 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 904 if (isFirstHeaderLine) 905 { 906 _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 907 } 908 return isFirstHeaderLine; 909 } 910 911 override void processNextLine(const char[][] lineFields) 912 { 913 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 914 assert(_keyFieldIndices.length >= 2); 915 916 string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 917 processNextLineWithKey(key, lineFields); 918 } 919 } 920 921 version(unittest) 922 { 923 /* testSummarizer is a helper that can run many types of unit tests against 924 * Summarizers. It can also test operators, but there are separate helper functions 925 * better suited for that purpose. 926 * 927 * Arguments are a command line args, an input file, and expected output. The 928 * input file and expected output are already split into lines and fields, the helper 929 * manages re-assembly. The program name from the command line args is printed if an 930 * an error occurs, it is useful to identify the test that failed. 931 * 932 * Note: Much of this is a duplication tsvSummarize logic. Better abstraction of 933 * file input/output would enable running unit tests directly on top of tsvSummarize. 934 */ 935 void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected) 936 { 937 import std.array : appender; 938 939 assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty."); 940 941 auto formatAssertMessage(T...)(string msg, T formatArgs) 942 { 943 auto formatString = "[testSummarizer] %s: " ~ msg; 944 return format(formatString, cmdArgs[0], formatArgs); 945 } 946 947 TsvSummarizeOptions cmdopt; 948 auto savedCmdArgs = cmdArgs.to!string; 949 auto r = cmdopt.processArgs(cmdArgs); 950 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 951 952 assert(file.all!(line => line.length >= cmdopt.endFieldIndex), 953 formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file.")); 954 955 /* Pick the Summarizer based on the number of key-fields entered. */ 956 auto summarizer = 957 (cmdopt.keyFields.length == 0) 958 ? new NoKeySummarizer!(typeof(appender!(char[])()))( 959 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 960 961 : (cmdopt.keyFields.length == 1) 962 ? new OneKeySummarizer!(typeof(appender!(char[])()))( 963 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 964 965 : new MultiKeySummarizer!(typeof(appender!(char[])()))( 966 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); 967 968 /* Add the operators to the Summarizer. */ 969 summarizer.setOperators(inputRangeObject(cmdopt.operators[])); 970 971 /* Process the file one line at a time. */ 972 auto lineFields = new char[][](cmdopt.endFieldIndex); 973 bool headerFound = false; 974 foreach (lineNum, line; file.enumerate(1)) 975 { 976 /* Copy the needed fields to the fields array. */ 977 foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup; 978 979 if (cmdopt.hasHeader && lineNum == 1) 980 { 981 if (!headerFound) 982 { 983 summarizer.processHeaderLine(lineFields); 984 headerFound = true; 985 } 986 } 987 else 988 { 989 try summarizer.processNextLine(lineFields); 990 catch (Exception exc) 991 { 992 assert(false, formatAssertMessage(exc.msg)); 993 } 994 } 995 } 996 auto printOptions = SummarizerPrintOptions( 997 cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); 998 999 auto summarizerOutput = appender!(char[])(); 1000 1001 if (cmdopt.hasHeader || cmdopt.writeHeader) 1002 { 1003 summarizer.writeSummaryHeader(summarizerOutput, printOptions); 1004 } 1005 1006 summarizer.writeSummaryBody(summarizerOutput, printOptions); 1007 auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string; 1008 if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n"; 1009 1010 assert(summarizerOutput.data == expectedOutput, 1011 formatAssertMessage( 1012 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 1013 expectedOutput.to!string, summarizerOutput.data.to!string)); 1014 } 1015 } 1016 1017 unittest 1018 { 1019 /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited 1020 * extent, command line option handling (TsvSummarizeOptions). Individual operators 1021 * have separate tests, those tests test the no-key summarizer. The Values operator is 1022 * used in these tests. It engages a number of behaviors, and the results have limited 1023 * ambiguity. Using only one operator limits dependence on individual operators. 1024 */ 1025 1026 auto file1 = [["fld1", "fld2", "fld3"], 1027 ["a", "a", "3"], 1028 ["c", "a", "2b"], 1029 ["c", "bc", ""], 1030 ["a", "c", "2b"], 1031 ["", "bc", ""], 1032 ["c", "bc", "3"]]; 1033 1034 /* Single-key summarizer tests. 1035 */ 1036 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"], 1037 file1, 1038 [["fld1", "fld1_values"], 1039 ["a", "a|a"], 1040 ["c", "c|c|c"], 1041 ["", ""]] 1042 ); 1043 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"], 1044 file1, 1045 [["fld1", "fld2_values"], 1046 ["a", "a|c"], 1047 ["c", "a|bc|bc"], 1048 ["", "bc"]] 1049 ); 1050 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"], 1051 file1, 1052 [["fld1", "fld3_values"], 1053 ["a", "3|2b"], 1054 ["c", "2b||3"], 1055 ["", ""]] 1056 ); 1057 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"], 1058 file1, 1059 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1060 ["a", "a|a", "a|c", "3|2b"], 1061 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1062 ["", "", "bc", ""]] 1063 ); 1064 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"], 1065 file1, 1066 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1067 ["a", "a|a", "a|c", "3|2b"], 1068 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1069 ["", "", "bc", ""]] 1070 ); 1071 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"], 1072 file1, 1073 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1074 ["a", "3|2b", "a|c", "a|a"], 1075 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1076 ["", "", "bc", ""]] 1077 ); 1078 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"], 1079 file1, 1080 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1081 ["a", "3|2b", "a|c", "a|a"], 1082 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1083 ["", "", "bc", ""]] 1084 ); 1085 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"], 1086 file1, 1087 [["fld2", "fld1_values"], 1088 ["a", "a|c"], 1089 ["bc", "c||c"], 1090 ["c", "a"]] 1091 ); 1092 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"], 1093 file1, 1094 [["fld2", "fld2_values"], 1095 ["a", "a|a"], 1096 ["bc", "bc|bc|bc"], 1097 ["c", "c"]] 1098 ); 1099 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"], 1100 file1, 1101 [["fld2", "fld3_values"], 1102 ["a", "3|2b"], 1103 ["bc", "||3"], 1104 ["c", "2b"]] 1105 ); 1106 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"], 1107 file1, 1108 [["fld2", "fld1_values", "fld3_values"], 1109 ["a", "a|c", "3|2b"], 1110 ["bc", "c||c", "||3"], 1111 ["c", "a", "2b"]] 1112 ); 1113 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"], 1114 file1, 1115 [["fld2", "fld3_values", "fld1_values"], 1116 ["a", "3|2b", "a|c"], 1117 ["bc", "||3", "c||c"], 1118 ["c", "2b", "a"]] 1119 ); 1120 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"], 1121 file1, 1122 [["fld3", "fld1_values"], 1123 ["3", "a|c"], 1124 ["2b", "c|a"], 1125 ["", "c|"]] 1126 ); 1127 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"], 1128 file1, 1129 [["fld3", "fld2_values"], 1130 ["3", "a|bc"], 1131 ["2b", "a|c"], 1132 ["", "bc|bc"]] 1133 ); 1134 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"], 1135 file1, 1136 [["fld3", "fld1_values", "fld2_values"], 1137 ["3", "a|c", "a|bc"], 1138 ["2b", "c|a", "a|c"], 1139 ["", "c|", "bc|bc"]] 1140 ); 1141 1142 /* Multi-key summarizer tests. 1143 */ 1144 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"], 1145 file1, 1146 [["fld1", "fld2", "fld1_values"], 1147 ["a", "a", "a"], 1148 ["c", "a", "c"], 1149 ["c", "bc", "c|c"], 1150 ["a", "c", "a"], 1151 ["", "bc", ""]] 1152 ); 1153 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"], 1154 file1, 1155 [["fld1", "fld2", "fld2_values"], 1156 ["a", "a", "a"], 1157 ["c", "a", "a"], 1158 ["c", "bc", "bc|bc"], 1159 ["a", "c", "c"], 1160 ["", "bc", "bc"]] 1161 ); 1162 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"], 1163 file1, 1164 [["fld1", "fld2", "fld3_values"], 1165 ["a", "a", "3"], 1166 ["c", "a", "2b"], 1167 ["c", "bc", "|3"], 1168 ["a", "c", "2b"], 1169 ["", "bc", ""]] 1170 ); 1171 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"], 1172 file1, 1173 [["fld1", "fld2", "fld3_values", "fld1_values"], 1174 ["a", "a", "3", "a"], 1175 ["c", "a", "2b", "c"], 1176 ["c", "bc", "|3", "c|c"], 1177 ["a", "c", "2b", "a"], 1178 ["", "bc", "", ""]] 1179 ); 1180 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"], 1181 file1, 1182 [["fld3", "fld2", "fld1_values"], 1183 ["3", "a", "a"], 1184 ["2b", "a", "c"], 1185 ["", "bc", "c|"], 1186 ["2b", "c", "a"], 1187 ["3", "bc", "c"]] 1188 ); 1189 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"], 1190 file1, 1191 [["fld3", "fld2", "fld1_values"], 1192 ["3", "a", "a"], 1193 ["2b", "a", "c"], 1194 ["", "bc", "c|"], 1195 ["2b", "c", "a"], 1196 ["3", "bc", "c"]] 1197 ); 1198 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"], 1199 file1, 1200 [["fld2", "fld1", "fld3", "fld2_values"], 1201 ["a", "a", "3", "a"], 1202 ["a", "c", "2b", "a"], 1203 ["bc", "c", "", "bc"], 1204 ["c", "a", "2b", "c"], 1205 ["bc", "", "", "bc"], 1206 ["bc", "c", "3", "bc"]] 1207 ); 1208 1209 /* Missing policies. */ 1210 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"], 1211 file1, 1212 [["fld1", "fld1_values"], 1213 ["a", "a|a"], 1214 ["c", "c|c|c"], 1215 ["", ""]] 1216 ); 1217 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"], 1218 file1, 1219 [["fld1", "fld2_values"], 1220 ["a", "a|c"], 1221 ["c", "a|bc|bc"], 1222 ["", "bc"]] 1223 ); 1224 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"], 1225 file1, 1226 [["fld1", "fld3_values"], 1227 ["a", "3|2b"], 1228 ["c", "2b|3"], 1229 ["", ""]] 1230 ); 1231 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"], 1232 file1, 1233 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1234 ["a", "a|a", "a|c", "3|2b"], 1235 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1236 ["", "", "bc", ""]] 1237 ); 1238 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"], 1239 file1, 1240 [["fld1", "fld1_values"], 1241 ["a", "a|a"], 1242 ["c", "c|c|c"], 1243 ["", "NA"]] 1244 ); 1245 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"], 1246 file1, 1247 [["fld1", "fld2_values"], 1248 ["a", "a|c"], 1249 ["c", "a|bc|bc"], 1250 ["", "bc"]] 1251 ); 1252 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"], 1253 file1, 1254 [["fld1", "fld3_values"], 1255 ["a", "3|2b"], 1256 ["c", "2b|NA|3"], 1257 ["", "NA"]] 1258 ); 1259 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"], 1260 file1, 1261 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1262 ["a", "a|a", "a|c", "3|2b"], 1263 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1264 ["", "NA", "bc", "NA"]] 1265 ); 1266 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"], 1267 file1, 1268 [["fld1", "fld2", "fld3_values", "fld1_values"], 1269 ["a", "a", "3", "a"], 1270 ["c", "a", "2b", "c"], 1271 ["c", "bc", "3", "c|c"], 1272 ["a", "c", "2b", "a"], 1273 ["", "bc", "", ""]] 1274 ); 1275 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"], 1276 file1, 1277 [["fld3", "fld2", "fld1_values"], 1278 ["3", "a", "a"], 1279 ["2b", "a", "c"], 1280 ["", "bc", "c"], 1281 ["2b", "c", "a"], 1282 ["3", "bc", "c"]] 1283 ); 1284 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"], 1285 file1, 1286 [["fld2", "fld1", "fld3", "fld2_values"], 1287 ["a", "a", "3", "a"], 1288 ["a", "c", "2b", "a"], 1289 ["bc", "c", "", "bc"], 1290 ["c", "a", "2b", "c"], 1291 ["bc", "", "", "bc"], 1292 ["bc", "c", "3", "bc"]] 1293 ); 1294 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"], 1295 file1, 1296 [["fld1", "fld2", "fld3_values", "fld1_values"], 1297 ["a", "a", "3", "a"], 1298 ["c", "a", "2b", "c"], 1299 ["c", "bc", "NA|3", "c|c"], 1300 ["a", "c", "2b", "a"], 1301 ["", "bc", "NA", "NA"]] 1302 ); 1303 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"], 1304 file1, 1305 [["fld3", "fld2", "fld1_values"], 1306 ["3", "a", "a"], 1307 ["2b", "a", "c"], 1308 ["", "bc", "c|NA"], 1309 ["2b", "c", "a"], 1310 ["3", "bc", "c"]] 1311 ); 1312 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"], 1313 file1, 1314 [["fld2", "fld1", "fld3", "fld2_values"], 1315 ["a", "a", "3", "a"], 1316 ["a", "c", "2b", "a"], 1317 ["bc", "c", "", "bc"], 1318 ["c", "a", "2b", "c"], 1319 ["bc", "", "", "bc"], 1320 ["bc", "c", "3", "bc"]] 1321 ); 1322 1323 /* Validate that the no-key summarizer works with testSummarizer helper function. 1324 */ 1325 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"], 1326 file1, 1327 [["fld1_values", "fld2_values"], 1328 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1329 ); 1330 1331 /* Header variations: no header line; auto-generated header line; custom headers. 1332 */ 1333 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"], 1334 file1[1..$], 1335 [["a", "a|a"], 1336 ["c", "c|c|c"], 1337 ["", ""]] 1338 ); 1339 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"], 1340 file1[1..$], 1341 [["a", "a", "a"], 1342 ["c", "a", "a"], 1343 ["c", "bc", "bc|bc"], 1344 ["a", "c", "c"], 1345 ["", "bc", "bc"]] 1346 ); 1347 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"], 1348 file1[1..$], 1349 [["field2", "field1_values"], 1350 ["a", "a|c"], 1351 ["bc", "c||c"], 1352 ["c", "a"]] 1353 ); 1354 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"], 1355 file1[1..$], 1356 [["field3", "field2", "field1_values"], 1357 ["3", "a", "a"], 1358 ["2b", "a", "c"], 1359 ["", "bc", "c|"], 1360 ["2b", "c", "a"], 1361 ["3", "bc", "c"]] 1362 ); 1363 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"], 1364 file1, 1365 [["fld2", "Field3Values"], 1366 ["a", "3|2b"], 1367 ["bc", "||3"], 1368 ["c", "2b"]] 1369 ); 1370 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"], 1371 file1, 1372 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1373 ["a", "a", "3", "a"], 1374 ["c", "a", "2b", "c"], 1375 ["c", "bc", "|3", "c|c"], 1376 ["a", "c", "2b", "a"], 1377 ["", "bc", "", ""]] 1378 ); 1379 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"], 1380 file1[1..$], 1381 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1382 ["a", "3|2b", "a|c", "a|a"], 1383 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1384 ["", "", "bc", ""]] 1385 ); 1386 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1387 file1[1..$], 1388 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1389 ["a", "3", "a", "3", "a", "a"], 1390 ["c", "2b", "a", "2b", "c", "a"], 1391 ["c", "", "bc", "", "c", "bc"], 1392 ["a", "2b", "c", "2b", "a", "c"], 1393 ["", "", "bc", "", "", "bc"], 1394 ["c", "3", "bc", "3", "c", "bc"]] 1395 ); 1396 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1397 file1[1..$], 1398 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1399 ["a", "3", "a", "3", "a", "a"], 1400 ["c", "2b", "a", "2b", "c", "a"], 1401 ["c", "", "bc", "", "c", "bc"], 1402 ["a", "2b", "c", "2b", "a", "c"], 1403 ["", "", "bc", "", "", "bc"], 1404 ["c", "3", "bc", "3", "c", "bc"]] 1405 ); 1406 1407 /* Alternate file widths and lengths. 1408 */ 1409 1410 auto file3x2 = [["fld1", "fld2", "fld3"], 1411 ["a", "b", "c"], 1412 ["c", "b", "a"]]; 1413 1414 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"], 1415 file3x2, 1416 [["fld1", "fld3_values"], 1417 ["a", "c"], 1418 ["c", "a"]] 1419 ); 1420 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"], 1421 file3x2, 1422 [["fld2", "fld3_values"], 1423 ["b", "c|a"]] 1424 ); 1425 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"], 1426 file3x2, 1427 [["fld2", "fld1", "fld3_values"], 1428 ["b", "a", "c"], 1429 ["b", "c", "a"]] 1430 ); 1431 1432 auto file3x1 = [["fld1", "fld2", "fld3"], 1433 ["a", "b", "c"]]; 1434 1435 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"], 1436 file3x1, 1437 [["fld1", "fld3_values"], 1438 ["a", "c"]] 1439 ); 1440 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"], 1441 file3x1[1..$], 1442 [["a", "c"]] 1443 ); 1444 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"], 1445 file3x1, 1446 [["fld2", "fld1", "fld3_values"], 1447 ["b", "a", "c"]] 1448 ); 1449 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3"], 1450 file3x1[1..$], 1451 [["b", "a", "c"]] 1452 ); 1453 1454 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1455 1456 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"], 1457 file3x0, 1458 [["fld1", "fld3_values"]] 1459 ); 1460 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"], 1461 file3x0[1..$], 1462 [] 1463 ); 1464 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"], 1465 file3x0[1..$], 1466 [["field1", "field3_values"]] 1467 ); 1468 1469 1470 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"], 1471 file3x0, 1472 [["fld2", "fld1", "fld3_values"]] 1473 ); 1474 1475 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"], 1476 file3x0[1..$], 1477 [] 1478 ); 1479 1480 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"], 1481 file3x0[1..$], 1482 [["field2", "field1", "field3_values"]] 1483 ); 1484 1485 auto file2x1 = [["fld1", "fld2"], 1486 ["a", "b"]]; 1487 1488 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"], 1489 file2x1, 1490 [["fld1", "fld2_values"], 1491 ["a", "b"]] 1492 ); 1493 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"], 1494 file2x1, 1495 [["fld2", "fld1", "fld1_values"], 1496 ["b", "a", "a"]] 1497 ); 1498 1499 auto file2x0 = [["fld1", "fld2"]]; 1500 1501 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"], 1502 file2x0, 1503 [["fld1", "fld2_values"]] 1504 ); 1505 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"], 1506 file2x0, 1507 [["fld2", "fld1", "fld1_values"]] 1508 ); 1509 1510 auto file1x2 = [["fld1"], 1511 ["a"], 1512 [""]]; 1513 1514 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"], 1515 file1x2, 1516 [["fld1", "fld1_values"], 1517 ["a", "a"], 1518 ["", ""]] 1519 ); 1520 1521 auto file1x2b = [["fld1"], 1522 [""], 1523 [""]]; 1524 1525 testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"], 1526 file1x2b, 1527 [["fld1", "fld1_values"], 1528 ["", "|"]] 1529 ); 1530 1531 auto file1x1 = [["fld1"], 1532 ["x"]]; 1533 1534 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"], 1535 file1x1, 1536 [["fld1", "fld1_values"], 1537 ["x", "x"]] 1538 ); 1539 1540 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"], 1541 file1x1[1..$], 1542 [["x", "x"]] 1543 ); 1544 1545 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"], 1546 file1x1[1..$], 1547 [["field1", "field1_values"], 1548 ["x", "x"]] 1549 ); 1550 1551 auto file1x1b = [["fld1"], 1552 [""]]; 1553 1554 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"], 1555 file1x1b, 1556 [["fld1", "fld1_values"], 1557 ["", ""]] 1558 ); 1559 1560 auto file1x0 = [["fld1"]]; 1561 1562 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"], 1563 file1x0, 1564 [["fld1", "fld1_values"]] 1565 ); 1566 1567 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"], 1568 file1x0[1..$], 1569 [] 1570 ); 1571 1572 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"], 1573 file1x0[1..$], 1574 [["field1", "field1_values"]] 1575 ); 1576 1577 /* Alternate delimiters. */ 1578 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"], 1579 file1, 1580 [["fld1_values", "fld2_values"], 1581 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1582 ); 1583 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"], 1584 file1, 1585 [["fld1_values", "fld2_values"], 1586 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1587 ); 1588 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","], 1589 file1, 1590 [["fld1_values", "fld2_values"], 1591 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1592 ); 1593 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1594 "--delimiter", "^", "--values-delimiter", ":"], 1595 file1[1..$], 1596 [["field2", "field1_values"], 1597 ["a", "a:c"], 1598 ["bc", "c::c"], 1599 ["c", "a"]] 1600 ); 1601 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1602 "--values-delimiter", "\\"], 1603 file1[1..$], 1604 [["a", "a", "a"], 1605 ["c", "a", "a"], 1606 ["c", "bc", "bc\\bc"], 1607 ["a", "c", "c"], 1608 ["", "bc", "bc"]] 1609 ); 1610 } 1611 1612 /* Summary Operators and Calculators 1613 * 1614 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1615 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1616 * Calculator is used to manage the summary calculation for each unique key in the input. 1617 * 1618 * As an example, consider the command: 1619 * 1620 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1621 * 1622 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1623 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1624 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1625 * calculator needs to track occurrence count and sum. Calculators produce the final 1626 * value when all processing is finished. 1627 * 1628 * Summary field headers 1629 * 1630 * There are several options for specifying summary field headers. The defaults combine the 1631 * operator name and the header of the field summarized. The defaults can be overridden on 1632 * on the command line. These scenarios are supported via the operator constructor and the 1633 * processHeaderLine() method. 1634 * 1635 * Missing field policy 1636 * 1637 * At present, tsv-summarize has a single policy for handling missing values that applies 1638 * to all operators. However, it is logically operator specific and is implemented that 1639 * way. The MissingFieldPolicy struct describes the policy, each operator contains one. 1640 * Calculators access thier operator's policy struct. 1641 */ 1642 1643 /** An Operator represents a summary calculation specified on the command line. 1644 * e.g. '--mean 5'. 1645 */ 1646 interface Operator 1647 { 1648 @property string header(); 1649 @property string name(); 1650 void processHeaderLine(const char[][] fields); 1651 size_t[] numericFieldsToSave(); // Numeric fields this Operator needs saved 1652 size_t[] textFieldsToSave(); // Text fields this Operator needs saved 1653 Calculator makeCalculator(); 1654 } 1655 1656 /** Calculators are responsible for the calculation of a single computation. They 1657 * process each line and produce the final value when all processing is finished. 1658 */ 1659 interface Calculator 1660 { 1661 void processNextLine(const char[][] fields); 1662 string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions); 1663 } 1664 1665 /** This class describes processing behavior when a missing value is encountered. 1666 */ 1667 class MissingFieldPolicy 1668 { 1669 private bool _useMissing = true; // True if missing values are processed unchanged. 1670 private bool _replaceMissing = false; // True if missing values are replaced. 1671 private string _missingReplacement; // Replacement string if replaceMissing is true. 1672 1673 this (in bool excludeMissing = false, in string missingReplacement = "") 1674 { 1675 updatePolicy(excludeMissing, missingReplacement); 1676 } 1677 1678 void updatePolicy(in bool excludeMissing, in string missingReplacement) 1679 { 1680 _missingReplacement = missingReplacement; 1681 _replaceMissing = missingReplacement.length != 0; 1682 _useMissing = !excludeMissing && !replaceMissing; 1683 } 1684 1685 final bool isMissingField(const char[] field) const 1686 { 1687 return field.length == 0; 1688 } 1689 1690 final bool useMissing() const @property 1691 { 1692 return _useMissing; 1693 } 1694 1695 final bool excludeMissing() const @property 1696 { 1697 return !_useMissing && !_replaceMissing; 1698 } 1699 1700 final bool replaceMissing() const @property 1701 { 1702 return _replaceMissing; 1703 } 1704 1705 final string missingReplacement() const @property 1706 { 1707 return _missingReplacement; 1708 } 1709 } 1710 1711 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected 1712 * while reading data. Operations like median collect all values and operate on them when 1713 * running the final calculation. Value lists are needed for each unique key. A command 1714 * using multiple Operators may save multiple fields. And, different Operators may be run 1715 * against the same field. 1716 * 1717 * The last part motivates these classes. Handling large data sets necessitates minimizing 1718 * in-memory storage, making it desirable to share identical lists between Calculators. 1719 * Otherwise, each Calculator could implement its own storage, which would be simpler. 1720 * 1721 * The setup works as follows: 1722 * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods). 1723 * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list 1724 * of the fields advertised by Operators as needing sharing. This list gets created 1725 * during command initialization (SummarizerBase.setOperators). 1726 * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every 1727 * time a new unique key is found, in parellel to the Calculator objects created for the 1728 * key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes. 1729 * - A unique key's UniqueKeyValuesLists object is passed each input line, same as 1730 * Calculators, saving the values. 1731 * - Calculators retrieve the saved values during the calculation phase. The calculator's 1732 * ProcessNextField method is typically a no-op. 1733 * - Calculators cannot make assumptions about the order of the saved values. This is 1734 * pragmatic concession to median and quantile calculations, which need to sort the data, 1735 * at least partially. Rather than generate sorted copies, the current algorithms 1736 * sort the data in place. 1737 * 1738 * One concession to duplicate storage is that text and numeric versions of the same 1739 * field might be stored. The reason is because it's important to convert text to numbers 1740 * as they are read so that useful error messages can be generated. And, storing both 1741 * forms of the same field should be less common. 1742 * 1743 * The current implementation uses the same missing values policy for all fields. If 1744 * multiple policies become supported this will need to change. 1745 * 1746 * Built-in calculations - UniqueKeyValueLists have a built-in median operation. This is 1747 * to avoid repeated calculations of the median by different calculations. 1748 */ 1749 1750 class SharedFieldValues 1751 { 1752 // Arrays with field indices that need to be saved. 1753 private size_t[] _numericFieldIndices; 1754 private size_t[] _textFieldIndices; 1755 1756 /* Called during summarizer setup to add a shared field value for a specific field index. 1757 * eg. '--median 7' will add end up calling addNumericIdex(6), 6 being the zero-based index. 1758 * A specific index is only added once. 1759 */ 1760 final void addNumericIndex (size_t index) 1761 { 1762 if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index; 1763 } 1764 1765 /* Similar to addNumericIndex, except adds a text index. */ 1766 final void addTextIndex (size_t index) 1767 { 1768 if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index; 1769 } 1770 1771 /* Called every time a new key is found, or once at the beginning of the program if no keys 1772 * are being used (entire column summarized). 1773 */ 1774 final UniqueKeyValuesLists makeUniqueKeyValuesLists() 1775 { 1776 return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices); 1777 } 1778 } 1779 1780 class UniqueKeyValuesLists 1781 { 1782 /* A FieldValues object holds is a list of values collect for a specific field. A 1783 * unique key may hold several. For example, the command: 1784 * $ tsv-summarize --k 1 --median 4 -- median 5 1785 * requires keeping lists for both fields 4 and 5. This in turn will result in a 1786 * _numericFieldValues being a 2 element array, one with a list of field 4 values, 1787 * the second of field 5 values. Linear search is used to find a specific field. 1788 */ 1789 private FieldValues!double[] _numericFieldValues; 1790 private FieldValues!string[] _textFieldValues; 1791 private double[] _numericFieldMedians; 1792 1793 /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. */ 1794 this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices) 1795 { 1796 if (numericFieldIndices.length > 0) 1797 { 1798 _numericFieldValues = new FieldValues!double[](numericFieldIndices.length); 1799 foreach (i, fieldIndex; numericFieldIndices) 1800 _numericFieldValues[i] = new FieldValues!double(fieldIndex); 1801 } 1802 1803 if (textFieldIndices.length > 0) 1804 { 1805 _textFieldValues = new FieldValues!string[](textFieldIndices.length); 1806 foreach (i, fieldIndex; textFieldIndices) 1807 _textFieldValues[i] = new FieldValues!string(fieldIndex); 1808 } 1809 } 1810 1811 void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) 1812 { 1813 _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1814 _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1815 } 1816 1817 private FieldValues!double findNumericFieldValues(size_t index) 1818 { 1819 alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b); 1820 auto r = find!pred(_numericFieldValues, index); 1821 assert(!r.empty); 1822 return r.front; 1823 } 1824 1825 private FieldValues!string findTextFieldValues(size_t index) 1826 { 1827 alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b); 1828 auto r = find!pred(_textFieldValues, index); 1829 assert(!r.empty); 1830 return r.front; 1831 } 1832 1833 final double[] numericValues(size_t index) 1834 { 1835 return findNumericFieldValues(index).getArray; 1836 } 1837 1838 final double[] numericValuesSorted(size_t index) 1839 { 1840 return findNumericFieldValues(index).getSortedArray; 1841 } 1842 1843 final string[] textValues(size_t index) 1844 { 1845 return findTextFieldValues(index).getArray; 1846 } 1847 1848 final string[] textValuesSorted(size_t index) 1849 { 1850 return findTextFieldValues(index).getSortedArray; 1851 } 1852 1853 final double numericValuesMedian(size_t index) 1854 { 1855 return findNumericFieldValues(index).median; 1856 } 1857 1858 private class FieldValues(ValueType) 1859 { 1860 import std.array : appender; 1861 private size_t _fieldIndex; 1862 private Appender!(ValueType[]) _values; 1863 private bool _haveMedian = false; 1864 private bool _isSorted = false; 1865 private ValueType _medianValue; 1866 1867 this(size_t fieldIndex) 1868 { 1869 _fieldIndex = fieldIndex; 1870 } 1871 1872 final size_t length() const @property 1873 { 1874 return _values.data.length; 1875 } 1876 1877 final size_t fieldIndex() const @property 1878 { 1879 return _fieldIndex; 1880 } 1881 1882 final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) 1883 { 1884 debug writefln("[%s]: %s", __FUNCTION__, fields.to!string); 1885 1886 const char[] field = fields[_fieldIndex]; 1887 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 1888 { 1889 _values.put(field.to!ValueType); 1890 _haveMedian = false; 1891 _isSorted = false; 1892 } 1893 else if (missingPolicy.replaceMissing) 1894 { 1895 _values.put(missingPolicy.missingReplacement.to!ValueType); 1896 _haveMedian = false; 1897 _isSorted = false; 1898 } 1899 } 1900 1901 /* Return an input range of the values. */ 1902 final auto values() 1903 { 1904 return _values.data; 1905 } 1906 1907 final ValueType[] getArray() 1908 { 1909 return _values.data; 1910 } 1911 1912 final ValueType[] getSortedArray() 1913 { 1914 if (!_isSorted) 1915 { 1916 import std.algorithm : sort; 1917 sort(_values.data); 1918 _isSorted = true; 1919 } 1920 return _values.data; 1921 } 1922 1923 final ValueType median() 1924 { 1925 if (!_haveMedian) 1926 { 1927 import tsv_numerics : rangeMedian; 1928 _medianValue = _values.data.rangeMedian(); 1929 _haveMedian = true; 1930 } 1931 1932 return _medianValue; 1933 } 1934 } 1935 } 1936 1937 /** SingleFieldOperator is a base class for single field operators, the most common 1938 * Operator. Derived classes implement makeCalculator and the Calculator class it returns. 1939 */ 1940 class SingleFieldOperator : Operator 1941 { 1942 import std.typecons : Flag; 1943 1944 private string _name; 1945 private string _header; 1946 private size_t _fieldIndex; 1947 private bool _useHeaderSuffix; 1948 private bool _allowCustomHeader; 1949 private bool _hasCustomHeader = false; 1950 private size_t[] _numericFieldsToSave; 1951 private size_t[] _textFieldsToSave; 1952 private MissingFieldPolicy _missingPolicy; 1953 1954 this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, 1955 Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, 1956 Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) 1957 { 1958 _name = operatorName; 1959 _fieldIndex = fieldIndex; 1960 _missingPolicy = missingPolicy; 1961 _useHeaderSuffix = useHeaderSuffix; 1962 _allowCustomHeader = allowCustomHeader; 1963 // Default header. May be overrridden by custom header or header line. 1964 _header = 1965 fieldHeaderFromIndex(fieldIndex) 1966 .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); 1967 } 1968 1969 void setCustomHeader (string customHeader) 1970 { 1971 assert(_allowCustomHeader); 1972 _header = customHeader; 1973 _hasCustomHeader = true; 1974 } 1975 1976 final string name() const @property 1977 { 1978 return _name; 1979 } 1980 1981 final bool allowCustomHeader() const @property 1982 { 1983 return _allowCustomHeader; 1984 } 1985 1986 /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field 1987 * that the field values should be saved. These should called during construction. 1988 */ 1989 final void setSaveFieldValuesNumeric() 1990 { 1991 _numericFieldsToSave ~= _fieldIndex; 1992 } 1993 1994 final void setSaveFieldValuesText() 1995 { 1996 _textFieldsToSave ~= _fieldIndex; 1997 } 1998 1999 final MissingFieldPolicy missingPolicy() @property 2000 { 2001 return _missingPolicy; 2002 } 2003 2004 final size_t fieldIndex() const @property 2005 { 2006 return _fieldIndex; 2007 } 2008 2009 final string header() const @property 2010 { 2011 return _header; 2012 } 2013 2014 final bool useHeaderSuffix() const @property 2015 { 2016 return _useHeaderSuffix; 2017 } 2018 2019 void processHeaderLine(const char[][] fields) 2020 { 2021 if (!_hasCustomHeader) { 2022 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2023 _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 2024 _useHeaderSuffix ? _name : ""); 2025 } 2026 } 2027 2028 final size_t[] numericFieldsToSave() 2029 { 2030 return _numericFieldsToSave; 2031 } 2032 2033 final size_t[] textFieldsToSave() 2034 { 2035 return _textFieldsToSave; 2036 } 2037 2038 abstract SingleFieldCalculator makeCalculator(); 2039 } 2040 2041 /** SingleFieldCalculator is a base class for the common case of calculators using a single 2042 * field. Derived classes implement processNextField() rather than processNextLine(). 2043 */ 2044 class SingleFieldCalculator : Calculator 2045 { 2046 private size_t _fieldIndex; 2047 2048 this(size_t fieldIndex) 2049 { 2050 _fieldIndex = fieldIndex; 2051 } 2052 2053 final size_t fieldIndex() const @property 2054 { 2055 return _fieldIndex; 2056 } 2057 2058 final void processNextLine(const char[][] fields) 2059 { 2060 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2061 2062 auto missingPolicy = getOperator.missingPolicy; 2063 const char[] field = fields[_fieldIndex]; 2064 2065 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 2066 { 2067 processNextField(field); 2068 } 2069 else if (missingPolicy.replaceMissing) 2070 { 2071 processNextField(missingPolicy.missingReplacement); 2072 } 2073 } 2074 2075 abstract SingleFieldOperator getOperator(); 2076 2077 abstract void processNextField(const char[] field); 2078 } 2079 2080 /* Unittest helper functions. Only compiled when -unittest is in effect. */ 2081 version(unittest) 2082 { 2083 /** A helper for SingleFieldOperator unit tests. 2084 * 2085 * testSingleFieldOperator takes a set of split file values, a field index, a header 2086 * suffix, and a set of expected values. The expected values array contains the 2087 * initial value (zero entries) and the expected values after each line. (One more 2088 * expected value than input lines.) The zero entry case is what is generated for an 2089 * empty file. An example testing the 'min' operator against a file with 2 columns, 2090 * 3 rows, using field index 1: 2091 * 2092 * testSingleFieldOperator!MinOperator( 2093 * [["10", "100"], // The split file. 3 lines by 2 rows. 2094 * ["5", "50"], 2095 * ["20", "200"]], 2096 * 1, // Field index (zero-based, so "100", "50", "200") 2097 * "min", // The header suffix, normally the operator name. 2098 * ["nan", "100", "50", "50"]); // Min value after processing each line. 2099 * 2100 * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. 2101 * Then run the operator is tested against each column, a total of six calls. Headers 2102 * are automatically checked. Additional entries can be used to extend coverage. 2103 * 2104 * A non-default MissingFieldPolicy can be provide as an optional last argument. 2105 * Operator tests should include exclusion and replacement variations. See operator 2106 * unit tests for details. 2107 * 2108 * The testSingleFieldOperatorBase adds an additional capability - Custom operator 2109 * init arguments. Currently this is used only by the quantile operator. 2110 * 2111 * These tests do not check unique key behavior (group-by). Operators don't have info 2112 * about unique keys, and interact with them only indirectly, via Calculators. 2113 */ 2114 void testSingleFieldOperator(OperatorClass : SingleFieldOperator) 2115 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2116 const char[][] expectedValues, 2117 MissingFieldPolicy missingPolicy = new MissingFieldPolicy) 2118 { 2119 testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy); 2120 } 2121 2122 void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...) 2123 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2124 const char[][] expectedValues, 2125 MissingFieldPolicy missingPolicy, 2126 T extraOpInitArgs) 2127 { 2128 import std.format : format; 2129 import std.array : appender; 2130 import std..string : chomp; 2131 import std.traits : EnumMembers; 2132 2133 auto numFields = (splitFile[0]).length; 2134 2135 assert(fieldIndex < numFields, 2136 format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s", 2137 headerSuffix)); 2138 assert(splitFile.length + 1 == expectedValues.length, 2139 format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2140 headerSuffix)); 2141 2142 /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */ 2143 auto printOptions = SummarizerPrintOptions('#', '|'); 2144 2145 /* An input header line. */ 2146 string[] inputHeaderLine = new string[numFields]; 2147 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2148 2149 /* The different expected output field headers. */ 2150 auto outputFieldHeaderWithNoHeaderLine = 2151 fieldHeaderFromIndex(fieldIndex) 2152 .summaryHeaderFromFieldHeader(headerSuffix); 2153 auto outputFieldHeaderFromHeaderLine = 2154 inputHeaderLine[fieldIndex] 2155 .summaryHeaderFromFieldHeader(headerSuffix); 2156 auto customOutputFieldHeader = "custom"; 2157 2158 enum HeaderUsecase { 2159 HeaderLine_DefaultHeader, 2160 HeaderLine_CustomHeader, 2161 NoHeaderLine_DefaultHeader, 2162 NoHeaderLine_CustomHeader, 2163 NoHeaderLine_NoOutputHeader, 2164 } 2165 2166 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2167 { 2168 return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2169 op.name, hc, actual, expected); 2170 } 2171 2172 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex, 2173 const char[] actual, const char[] expected) 2174 { 2175 return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'", 2176 op.name, hc, rowIndex, fieldIndex, actual, expected); 2177 } 2178 2179 /* Run the logic for each header use case. */ 2180 foreach (hc; EnumMembers!HeaderUsecase) 2181 { 2182 bool hasInputHeader = ( 2183 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2184 hc == HeaderUsecase.HeaderLine_CustomHeader 2185 ); 2186 bool hasOutputHeader = ( 2187 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2188 hc == HeaderUsecase.HeaderLine_CustomHeader || 2189 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2190 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2191 ); 2192 bool hasCustomHeader = ( 2193 hc == HeaderUsecase.HeaderLine_CustomHeader || 2194 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2195 ); 2196 2197 if (hasCustomHeader) assert(hasOutputHeader); 2198 2199 auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs); 2200 2201 if (hasCustomHeader) 2202 { 2203 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2204 op.setCustomHeader(customOutputFieldHeader); 2205 } 2206 2207 Operator[] operatorArray; 2208 operatorArray ~= op; 2209 2210 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2211 summarizer.setOperators(inputRangeObject(operatorArray)); 2212 2213 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2214 2215 if (hasOutputHeader) 2216 { 2217 /* Write the header line. Note that this is a one-field header, */ 2218 auto headerLineOutput = appender!(char[])(); 2219 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2220 2221 /* Test that the header was generated correctly. 2222 * 2223 * Note: Because the output is generated by a Summarizer, it will have a 2224 * trailing newline. Use chomp to trim it. 2225 */ 2226 final switch (hc) 2227 { 2228 case HeaderUsecase.HeaderLine_DefaultHeader: 2229 assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine, 2230 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2231 outputFieldHeaderFromHeaderLine)); 2232 break; 2233 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2234 assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine, 2235 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2236 outputFieldHeaderWithNoHeaderLine)); 2237 break; 2238 case HeaderUsecase.HeaderLine_CustomHeader: 2239 case HeaderUsecase.NoHeaderLine_CustomHeader: 2240 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2241 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2242 customOutputFieldHeader)); 2243 break; 2244 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2245 break; 2246 } 2247 2248 } 2249 2250 /* For each line, process the line, generate the output, and test that the 2251 * value is correct. Start with the empty file case. 2252 */ 2253 foreach (i, const char[] expected; expectedValues) 2254 { 2255 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2256 auto summaryLineOutput = appender!(char[])(); 2257 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2258 assert(summaryLineOutput.data.chomp == expected, 2259 valueAssertMessage(operatorArray[0], hc, i, fieldIndex, 2260 summaryLineOutput.data.chomp, expectedValues[i])); 2261 } 2262 } 2263 } 2264 } 2265 2266 /** ZeroFieldOperator is a base class for operators that take no input. The main use 2267 * case is the CountOperator, which counts the occurrences of each unique key. Other 2268 * uses are possible, for example, weighted random number assignment. 2269 * 2270 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify 2271 * the information available to such a routine. In particular, the split fields passed 2272 * to processHeaderLine and processNextLine don't include all fields in the input, 2273 * something that might not be obvious when implementing an operator. (Only fields 2274 * required by operators acting on specific fields are included.) 2275 */ 2276 class ZeroFieldOperator : Operator 2277 { 2278 import std.typecons : Flag; 2279 2280 private string _name; 2281 private string _header; 2282 2283 this(string operatorName) 2284 { 2285 _name = operatorName; 2286 _header = operatorName; 2287 } 2288 2289 void setCustomHeader (string customHeader) 2290 { 2291 _header = customHeader; 2292 } 2293 2294 bool allowCustomHeader() const @property 2295 { 2296 return true; 2297 } 2298 2299 final string name() const @property 2300 { 2301 return _name; 2302 } 2303 2304 final string header() const @property 2305 { 2306 return _header; 2307 } 2308 2309 /* A no-op. ZeroFieldOperators have no access to the header line. */ 2310 final void processHeaderLine(const char[][] fields) { } 2311 2312 /* A no-op. ZeroFieldOperators have no access to fields. */ 2313 final size_t[] numericFieldsToSave() 2314 { 2315 size_t[] emptyArray; 2316 return emptyArray; 2317 } 2318 2319 /* A no-op. ZeroFieldOperators have no access to fields. */ 2320 final size_t[] textFieldsToSave() 2321 { 2322 size_t[] emptyArray; 2323 return emptyArray; 2324 } 2325 2326 abstract ZeroFieldCalculator makeCalculator(); 2327 } 2328 2329 /** ZeroFieldCalculator is a base class for operators that don't use fields as input. 2330 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class. 2331 * 2332 * Derived classes implement processNextEntry() rather than processNextLine(), and the 2333 * single argument form of calculate() given as an abstract function. 2334 */ 2335 class ZeroFieldCalculator : Calculator 2336 { 2337 this() { } 2338 2339 final void processNextLine(const char[][] fields) 2340 { 2341 debug writefln("[%s]", __FUNCTION__,); 2342 processNextEntry(); 2343 } 2344 2345 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2346 { 2347 return calculate(printOptions); 2348 } 2349 2350 abstract void processNextEntry(); 2351 abstract string calculate(const ref SummarizerPrintOptions printOptions); 2352 } 2353 2354 version(unittest) 2355 { 2356 /* A helper for ZeroFieldOperator unit tests. 2357 * 2358 * testZeroFieldOperator takes a set of split file values, a default header, and a 2359 * set of expected values. The expected values array contains the expected values 2360 * after each line. 2361 * 2362 * testZeroFieldOperator is very similar to testSingleFieldOperator, except that 2363 * there is no use of field indices and fewer types of headers. See the latter's 2364 * documentation and the CountOperator unit tests for examples. 2365 */ 2366 void testZeroFieldOperator(OperatorClass : ZeroFieldOperator) 2367 (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues) 2368 { 2369 import std.format : format; 2370 import std.array : appender; 2371 import std..string : chomp; 2372 import std.traits : EnumMembers; 2373 2374 auto numFields = (splitFile[0]).length; 2375 2376 assert(splitFile.length + 1 == expectedValues.length, 2377 format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2378 defaultHeader)); 2379 2380 /* printOptions - Not used these tests, but needed for API calls. */ 2381 auto printOptions = SummarizerPrintOptions('#', '|'); 2382 2383 /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */ 2384 auto missingPolicy = new MissingFieldPolicy; 2385 2386 /* An input header line. */ 2387 string[] inputHeaderLine = new string[numFields]; 2388 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2389 2390 auto customOutputFieldHeader = "custom"; 2391 2392 enum HeaderUsecase { 2393 HeaderLine_DefaultHeader, 2394 HeaderLine_CustomHeader, 2395 NoHeaderLine_DefaultHeader, 2396 NoHeaderLine_CustomHeader, 2397 NoHeaderLine_NoOutputHeader, 2398 } 2399 2400 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2401 { 2402 return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2403 op.name, hc, actual, expected); 2404 } 2405 2406 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, 2407 const char[] actual, const char[] expected) 2408 { 2409 return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", 2410 op.name, hc, rowIndex, actual, expected); 2411 } 2412 2413 /* Run the logic for each header use case. */ 2414 foreach (hc; EnumMembers!HeaderUsecase) 2415 { 2416 bool hasInputHeader = ( 2417 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2418 hc == HeaderUsecase.HeaderLine_CustomHeader 2419 ); 2420 bool hasOutputHeader = ( 2421 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2422 hc == HeaderUsecase.HeaderLine_CustomHeader || 2423 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2424 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2425 ); 2426 bool hasCustomHeader = ( 2427 hc == HeaderUsecase.HeaderLine_CustomHeader || 2428 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2429 ); 2430 2431 if (hasCustomHeader) assert(hasOutputHeader); 2432 2433 auto op = new OperatorClass(); 2434 2435 if (hasCustomHeader) 2436 { 2437 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2438 op.setCustomHeader(customOutputFieldHeader); 2439 } 2440 2441 Operator[] operatorArray; 2442 operatorArray ~= op; 2443 2444 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2445 summarizer.setOperators(inputRangeObject(operatorArray)); 2446 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2447 2448 if (hasOutputHeader) 2449 { 2450 /* Write the header line. Note that this is a one-field header, */ 2451 auto headerLineOutput = appender!(char[])(); 2452 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2453 2454 /* Test that the header was generated correctly. 2455 * 2456 * Note: Because the output is generated by a Summarizer, it will have a 2457 * trailing newline. Use chomp to trim it. 2458 */ 2459 final switch (hc) 2460 { 2461 case HeaderUsecase.HeaderLine_DefaultHeader: 2462 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2463 assert(headerLineOutput.data.chomp == defaultHeader, 2464 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2465 defaultHeader)); 2466 break; 2467 case HeaderUsecase.HeaderLine_CustomHeader: 2468 case HeaderUsecase.NoHeaderLine_CustomHeader: 2469 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2470 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2471 customOutputFieldHeader)); 2472 break; 2473 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2474 break; 2475 } 2476 2477 } 2478 2479 /* For each line, process the line, generate the output, and test that the 2480 * value is correct. Start with the empty file case. 2481 */ 2482 foreach (i, const char[] expected; expectedValues) 2483 { 2484 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2485 auto summaryLineOutput = appender!(char[])(); 2486 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2487 assert(summaryLineOutput.data.chomp == expected, 2488 valueAssertMessage(operatorArray[0], hc, i, 2489 summaryLineOutput.data.chomp, expectedValues[i])); 2490 } 2491 } 2492 } 2493 } 2494 2495 /* Specific operators. 2496 * 2497 * Notes: 2498 * - The 'Calculator' inner classes are 'static'. This means inner class instances do not 2499 * keep a reference to the context of the outer class. In exchange, Calculator instances 2500 * need to hold all needed state, typically the field index they are summarizing. 2501 */ 2502 2503 /** CountOperator counts the number of occurrences of each unique key, or the number of 2504 * input lines if there is no unique key. 2505 * 2506 * CountOperator differs from most other operators in that it doesn't summarize a specific 2507 * field on the line. Instead it is summarizing a property of the unique key itself. For 2508 * this reason it doesn't derive from SingleFieldOperator. 2509 */ 2510 class CountOperator : ZeroFieldOperator 2511 { 2512 this() 2513 { 2514 super("count"); 2515 } 2516 2517 final override ZeroFieldCalculator makeCalculator() 2518 { 2519 return new CountCalculator(); 2520 } 2521 2522 static class CountCalculator : ZeroFieldCalculator 2523 { 2524 private size_t _count = 0; 2525 2526 final override void processNextEntry() 2527 { 2528 _count++; 2529 } 2530 2531 final override string calculate(const ref SummarizerPrintOptions printOptions) 2532 { 2533 return printOptions.formatNumber(_count); 2534 } 2535 } 2536 } 2537 2538 unittest // CountOperator 2539 { 2540 auto col1File = [["10"], ["9.5"], ["11"]]; 2541 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2542 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2543 2544 testZeroFieldOperator!CountOperator(col1File, "count", ["0", "1", "2", "3"]); 2545 testZeroFieldOperator!CountOperator(col2File, "count", ["0", "1", "2", "3"]); 2546 testZeroFieldOperator!CountOperator(col3File, "count", ["0", "1", "2", "3"]); 2547 } 2548 2549 /** RetainOperator retains the first occurrence of a field, without changing the header. 2550 * 2551 * RetainOperator is intended for fields where the value is expected to be the same for 2552 * all occurrences of the unique key, and the goal is to pass the value through unchanged. 2553 * It is like FirstOperator, except that the original header is preserved. The original 2554 * header preservation is setup in the call to the SingleFieldOperation constructor. 2555 * 2556 * Notes: 2557 * - An option to signal an error if multiple values are encountered might be useful. 2558 */ 2559 class RetainOperator : SingleFieldOperator 2560 { 2561 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2562 { 2563 super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader); 2564 } 2565 2566 final override SingleFieldCalculator makeCalculator() 2567 { 2568 return new RetainCalculator(fieldIndex); 2569 } 2570 2571 class RetainCalculator : SingleFieldCalculator 2572 { 2573 private bool _done = false; 2574 private string _value = ""; 2575 2576 this(size_t fieldIndex) 2577 { 2578 super(fieldIndex); 2579 } 2580 2581 final override RetainOperator getOperator() 2582 { 2583 return this.outer; 2584 } 2585 2586 final override void processNextField(const char[] nextField) 2587 { 2588 if (!_done) 2589 { 2590 _value = nextField.to!string; 2591 _done = true; 2592 } 2593 } 2594 2595 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2596 { 2597 return _value; 2598 } 2599 } 2600 } 2601 2602 unittest // RetainOperator 2603 { 2604 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2605 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2606 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2607 2608 testSingleFieldOperator!RetainOperator(col1File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2609 testSingleFieldOperator!RetainOperator(col2File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2610 testSingleFieldOperator!RetainOperator(col2File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2611 testSingleFieldOperator!RetainOperator(col3File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2612 testSingleFieldOperator!RetainOperator(col3File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2613 testSingleFieldOperator!RetainOperator(col3File, 2, "", ["", "r1c3", "r1c3", "r1c3"]); 2614 2615 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2616 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"], 2617 new MissingFieldPolicy(true, "")); // Exclude missing 2618 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"], 2619 new MissingFieldPolicy(false, "NA")); // Replace missing 2620 } 2621 2622 /** FirstOperator outputs the first value found for the field. 2623 */ 2624 class FirstOperator : SingleFieldOperator 2625 { 2626 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2627 { 2628 super("first", fieldIndex, missingPolicy); 2629 } 2630 2631 final override SingleFieldCalculator makeCalculator() 2632 { 2633 return new FirstCalculator(fieldIndex); 2634 } 2635 2636 class FirstCalculator : SingleFieldCalculator 2637 { 2638 private bool _done = false; 2639 private string _value = ""; 2640 2641 this(size_t fieldIndex) 2642 { 2643 super(fieldIndex); 2644 } 2645 2646 final override FirstOperator getOperator() 2647 { 2648 return this.outer; 2649 } 2650 2651 final override void processNextField(const char[] nextField) 2652 { 2653 if (!_done) 2654 { 2655 _value = nextField.to!string; 2656 _done = true; 2657 } 2658 } 2659 2660 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2661 { 2662 return _value; 2663 } 2664 } 2665 } 2666 2667 unittest // FirstOperator 2668 { 2669 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2670 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2671 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2672 2673 testSingleFieldOperator!FirstOperator(col1File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2674 testSingleFieldOperator!FirstOperator(col2File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2675 testSingleFieldOperator!FirstOperator(col2File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2676 testSingleFieldOperator!FirstOperator(col3File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2677 testSingleFieldOperator!FirstOperator(col3File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2678 testSingleFieldOperator!FirstOperator(col3File, 2, "first", ["", "r1c3", "r1c3", "r1c3"]); 2679 2680 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2681 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"], 2682 new MissingFieldPolicy(true, "")); // Exclude missing 2683 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"], 2684 new MissingFieldPolicy(false, "NA")); // Replace missing 2685 } 2686 2687 /** LastOperator outputs the last value found for the field. 2688 */ 2689 class LastOperator : SingleFieldOperator 2690 { 2691 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2692 { 2693 super("last", fieldIndex, missingPolicy); 2694 } 2695 2696 final override SingleFieldCalculator makeCalculator() 2697 { 2698 return new LastCalculator(fieldIndex); 2699 } 2700 2701 class LastCalculator : SingleFieldCalculator 2702 { 2703 private string _value = ""; 2704 2705 this(size_t fieldIndex) 2706 { 2707 super(fieldIndex); 2708 } 2709 2710 final override LastOperator getOperator() 2711 { 2712 return this.outer; 2713 } 2714 2715 final override void processNextField(const char[] nextField) 2716 { 2717 _value = nextField.to!string; 2718 } 2719 2720 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2721 { 2722 return _value; 2723 } 2724 } 2725 } 2726 2727 unittest // LastOperator 2728 { 2729 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2730 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2731 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2732 2733 testSingleFieldOperator!LastOperator(col1File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2734 testSingleFieldOperator!LastOperator(col2File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2735 testSingleFieldOperator!LastOperator(col2File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2736 testSingleFieldOperator!LastOperator(col3File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2737 testSingleFieldOperator!LastOperator(col3File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2738 testSingleFieldOperator!LastOperator(col3File, 2, "last", ["", "r1c3", "r2c3", "r3c3"]); 2739 2740 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2741 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"], 2742 new MissingFieldPolicy(true, "")); // Exclude missing 2743 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"], 2744 new MissingFieldPolicy(false, "NA")); // Replace missing 2745 } 2746 2747 /** MinOperator output the minimum value for the field. This is a numeric operator. 2748 */ 2749 class MinOperator : SingleFieldOperator 2750 { 2751 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2752 { 2753 super("min", fieldIndex, missingPolicy); 2754 } 2755 2756 final override SingleFieldCalculator makeCalculator() 2757 { 2758 return new MinCalculator(fieldIndex); 2759 } 2760 2761 class MinCalculator : SingleFieldCalculator 2762 { 2763 private bool _isFirst = true; 2764 private double _value = double.nan; 2765 2766 this(size_t fieldIndex) 2767 { 2768 super(fieldIndex); 2769 } 2770 2771 final override MinOperator getOperator() 2772 { 2773 return this.outer; 2774 } 2775 2776 final override void processNextField(const char[] nextField) 2777 { 2778 double fieldValue = nextField.to!double; 2779 if (_isFirst) 2780 { 2781 _value = fieldValue; 2782 _isFirst = false; 2783 } 2784 else if (fieldValue < _value) 2785 { 2786 _value = fieldValue; 2787 } 2788 } 2789 2790 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2791 { 2792 return printOptions.formatNumber(_value); 2793 } 2794 } 2795 } 2796 2797 unittest // MinOperator 2798 { 2799 auto col1File = [["10"], ["9.5"], ["11"]]; 2800 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2801 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2802 2803 testSingleFieldOperator!MinOperator(col1File, 0, "min", ["nan", "10", "9.5", "9.5"]); 2804 testSingleFieldOperator!MinOperator(col2File, 0, "min", ["nan", "20", "20", "20"]); 2805 testSingleFieldOperator!MinOperator(col2File, 1, "min", ["nan", "-30", "-30", "-31"]); 2806 testSingleFieldOperator!MinOperator(col3File, 0, "min", ["nan", "9009", "199", "199"]); 2807 testSingleFieldOperator!MinOperator(col3File, 1, "min", ["nan", "9", "0", "0"]); 2808 testSingleFieldOperator!MinOperator(col3File, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]); 2809 2810 auto col1misFile = [[""], ["10"], ["-10"]]; 2811 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"], 2812 new MissingFieldPolicy(true, "")); // Exclude missing 2813 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"], 2814 new MissingFieldPolicy(false, "5")); // Replace missing 2815 } 2816 2817 /** MaxOperator output the maximum value for the field. This is a numeric operator. 2818 */ 2819 class MaxOperator : SingleFieldOperator 2820 { 2821 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2822 { 2823 super("max", fieldIndex, missingPolicy); 2824 } 2825 2826 final override SingleFieldCalculator makeCalculator() 2827 { 2828 return new MaxCalculator(fieldIndex); 2829 } 2830 2831 class MaxCalculator : SingleFieldCalculator 2832 { 2833 private bool _isFirst = true; 2834 private double _value = double.nan; 2835 2836 this(size_t fieldIndex) 2837 { 2838 super(fieldIndex); 2839 } 2840 2841 final override MaxOperator getOperator() 2842 { 2843 return this.outer; 2844 } 2845 2846 final override void processNextField(const char[] nextField) 2847 { 2848 double fieldValue = nextField.to!double; 2849 if (_isFirst) 2850 { 2851 _value = fieldValue; 2852 _isFirst = false; 2853 } 2854 else if (fieldValue > _value) 2855 { 2856 _value = fieldValue; 2857 } 2858 } 2859 2860 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2861 { 2862 return printOptions.formatNumber(_value); 2863 } 2864 } 2865 } 2866 2867 unittest // MaxOperator 2868 { 2869 auto col1File = [["10"], ["9.5"], ["11"]]; 2870 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2871 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2872 2873 testSingleFieldOperator!MaxOperator(col1File, 0, "max", ["nan", "10", "10", "11"]); 2874 testSingleFieldOperator!MaxOperator(col2File, 0, "max", ["nan", "20", "21", "22"]); 2875 testSingleFieldOperator!MaxOperator(col2File, 1, "max", ["nan", "-30", "-29", "-29"]); 2876 testSingleFieldOperator!MaxOperator(col3File, 0, "max", ["nan", "9009", "9009", "9009"]); 2877 testSingleFieldOperator!MaxOperator(col3File, 1, "max", ["nan", "9", "9", "9"]); 2878 testSingleFieldOperator!MaxOperator(col3File, 2, "max", ["nan", "-4.5", "-0.5", "12"]); 2879 2880 auto col1misFile = [[""], ["-10"], ["10"]]; 2881 testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "nan", "-10", "10"], 2882 new MissingFieldPolicy(true, "")); // Exclude missing 2883 testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "5", "5", "10"], 2884 new MissingFieldPolicy(false, "5")); // Replace missing 2885 } 2886 2887 /** RangeOperator outputs the difference between the minimum and maximum values. 2888 * 2889 * If there is a single value, or all values are the same, the range is zero. This is 2890 * a numeric operator. 2891 */ 2892 class RangeOperator : SingleFieldOperator 2893 { 2894 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2895 { 2896 super("range", fieldIndex, missingPolicy); 2897 } 2898 2899 final override SingleFieldCalculator makeCalculator() 2900 { 2901 return new RangeCalculator(fieldIndex); 2902 } 2903 2904 class RangeCalculator : SingleFieldCalculator 2905 { 2906 private bool _isFirst = true; 2907 private double _minValue = 0.0; 2908 private double _maxValue = 0.0; 2909 2910 this(size_t fieldIndex) 2911 { 2912 super(fieldIndex); 2913 } 2914 2915 final override RangeOperator getOperator() 2916 { 2917 return this.outer; 2918 } 2919 2920 final override void processNextField(const char[] nextField) 2921 { 2922 double fieldValue = nextField.to!double; 2923 if (_isFirst) 2924 { 2925 _minValue = _maxValue = fieldValue; 2926 _isFirst = false; 2927 } 2928 else if (fieldValue > _maxValue) 2929 { 2930 _maxValue = fieldValue; 2931 } 2932 else if (fieldValue < _minValue) 2933 { 2934 _minValue = fieldValue; 2935 } 2936 } 2937 2938 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2939 { 2940 return printOptions.formatNumber(_maxValue - _minValue); 2941 } 2942 } 2943 } 2944 2945 unittest // RangeOperator 2946 { 2947 auto col1File = [["10"], ["9.5"], ["11"]]; 2948 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2949 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2950 2951 testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]); 2952 testSingleFieldOperator!RangeOperator(col2File, 0, "range", ["0", "0", "1", "2"]); 2953 testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]); 2954 testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]); 2955 testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]); 2956 testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]); 2957 2958 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 2959 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], 2960 new MissingFieldPolicy(true, "")); // Exclude missing 2961 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], 2962 new MissingFieldPolicy(false, "5.5")); // Replace missing 2963 } 2964 2965 /** SumOperator produces the sum of all the values. This is a numeric operator. 2966 */ 2967 class SumOperator : SingleFieldOperator 2968 { 2969 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2970 { 2971 super("sum", fieldIndex, missingPolicy); 2972 } 2973 2974 final override SingleFieldCalculator makeCalculator() 2975 { 2976 return new SumCalculator(fieldIndex); 2977 } 2978 2979 class SumCalculator : SingleFieldCalculator 2980 { 2981 private double _total = 0.0; 2982 2983 this(size_t fieldIndex) 2984 { 2985 super(fieldIndex); 2986 } 2987 2988 final override SumOperator getOperator() 2989 { 2990 return this.outer; 2991 } 2992 2993 final override void processNextField(const char[] nextField) 2994 { 2995 _total += nextField.to!double; 2996 } 2997 2998 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2999 { 3000 return printOptions.formatNumber(_total); 3001 } 3002 } 3003 } 3004 3005 unittest // SumOperator 3006 { 3007 auto col1File = [["10"], ["9.5"], ["11"]]; 3008 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3009 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 3010 3011 testSingleFieldOperator!SumOperator(col1File, 0, "sum", ["0", "10", "19.5", "30.5"]); 3012 testSingleFieldOperator!SumOperator(col2File, 0, "sum", ["0", "20", "41", "63"]); 3013 testSingleFieldOperator!SumOperator(col2File, 1, "sum", ["0", "-30", "-59", "-90"]); 3014 testSingleFieldOperator!SumOperator(col3File, 0, "sum", ["0", "9009", "9208", "12211"]); 3015 testSingleFieldOperator!SumOperator(col3File, 1, "sum", ["0", "9", "9", "9.2"]); 3016 testSingleFieldOperator!SumOperator(col3File, 2, "sum", ["0", "-4.5", "-5", "7"]); 3017 3018 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 3019 testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], 3020 new MissingFieldPolicy(true, "")); // Exclude missing 3021 testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], 3022 new MissingFieldPolicy(false, "1.5")); // Replace missing 3023 } 3024 3025 /** MeanOperator produces the mean (average) of all the values. This is a numeric operator. 3026 */ 3027 class MeanOperator : SingleFieldOperator 3028 { 3029 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3030 { 3031 super("mean", fieldIndex, missingPolicy); 3032 } 3033 3034 final override SingleFieldCalculator makeCalculator() 3035 { 3036 return new MeanCalculator(fieldIndex); 3037 } 3038 3039 class MeanCalculator : SingleFieldCalculator 3040 { 3041 private double _total = 0.0; 3042 private size_t _count = 0; 3043 3044 this(size_t fieldIndex) 3045 { 3046 super(fieldIndex); 3047 } 3048 3049 final override MeanOperator getOperator() 3050 { 3051 return this.outer; 3052 } 3053 3054 final override void processNextField(const char[] nextField) 3055 { 3056 _total += nextField.to!double; 3057 _count++; 3058 } 3059 3060 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3061 { 3062 return printOptions.formatNumber( 3063 (_count > 0) ? (_total / _count.to!double) : double.nan); 3064 } 3065 } 3066 } 3067 3068 unittest // MeanOperator 3069 { 3070 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3071 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3072 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3073 3074 testSingleFieldOperator!MeanOperator(col1File, 0, "mean", ["nan", "10", "9.75", "9"]); 3075 testSingleFieldOperator!MeanOperator(col2File, 0, "mean", ["nan", "20", "20.5", "21"]); 3076 testSingleFieldOperator!MeanOperator(col2File, 1, "mean", ["nan", "-30", "-29.5", "-30"]); 3077 testSingleFieldOperator!MeanOperator(col3File, 0, "mean", ["nan", "9009", "4509", "4509"]); 3078 testSingleFieldOperator!MeanOperator(col3File, 1, "mean", ["nan", "9", "4.5", "2"]); 3079 testSingleFieldOperator!MeanOperator(col3File, 2, "mean", ["nan", "-4.5", "-3", "2"]); 3080 3081 auto col1misFile = [[""], ["6"], [""], ["14"], ["40"]]; 3082 testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], 3083 new MissingFieldPolicy(true, "")); // Exclude missing 3084 testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"], 3085 new MissingFieldPolicy(false, "0")); // Replace missing 3086 } 3087 3088 /** MedianOperator produces the median of all the values. This is a numeric operator. 3089 * 3090 * All the field values are stored in memory as part of this calculation. This is 3091 * handled by unique key value lists. 3092 */ 3093 class MedianOperator : SingleFieldOperator 3094 { 3095 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3096 { 3097 super("median", fieldIndex, missingPolicy); 3098 setSaveFieldValuesNumeric(); 3099 } 3100 3101 final override SingleFieldCalculator makeCalculator() 3102 { 3103 return new MedianCalculator(fieldIndex); 3104 } 3105 3106 class MedianCalculator : SingleFieldCalculator 3107 { 3108 this(size_t fieldIndex) 3109 { 3110 super(fieldIndex); 3111 } 3112 3113 final override MedianOperator getOperator() 3114 { 3115 return this.outer; 3116 } 3117 3118 /* Work is done by saving the field values. */ 3119 final override void processNextField(const char[] nextField) 3120 { } 3121 3122 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3123 { 3124 return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex)); 3125 } 3126 } 3127 } 3128 3129 unittest // MedianOperator 3130 { 3131 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3132 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3133 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3134 3135 testSingleFieldOperator!MedianOperator(col1File, 0, "median", ["nan", "10", "9.75", "9.5"]); 3136 testSingleFieldOperator!MedianOperator(col2File, 0, "median", ["nan", "20", "20.5", "21"]); 3137 testSingleFieldOperator!MedianOperator(col2File, 1, "median", ["nan", "-30", "-29.5", "-30"]); 3138 testSingleFieldOperator!MedianOperator(col3File, 0, "median", ["nan", "9009", "4509", "4509"]); 3139 testSingleFieldOperator!MedianOperator(col3File, 1, "median", ["nan", "9", "4.5", "0"]); 3140 testSingleFieldOperator!MedianOperator(col3File, 2, "median", ["nan", "-4.5", "-3", "-1.5"]); 3141 3142 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 3143 testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], 3144 new MissingFieldPolicy(true, "")); // Exclude missing 3145 testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], 3146 new MissingFieldPolicy(false, "0")); // Replace missing 3147 } 3148 3149 /** QuantileOperator produces the value representing the data at a cummulative probability. 3150 * This is a numeric operation. 3151 * 3152 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities 3153 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the 3154 * median). Data is sorted is ascending order. This operator takes one percentile, but it 3155 * is common to generate multiple quantile ranks for the same field when summarizing. 3156 * 3157 * All the field's values are stored in memory as part of this calculation. This is 3158 * handled by unique key value lists. 3159 */ 3160 class QuantileOperator : SingleFieldOperator 3161 { 3162 private double _prob; 3163 3164 this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability) 3165 { 3166 assert(0.0 <= probability && probability <= 1.0); 3167 import std.format : format; 3168 3169 string header = (probability == 0.0) ? "pct0" : format("pct%02g", probability * 100.0); 3170 super(header, fieldIndex, missingPolicy); 3171 _prob = probability; 3172 setSaveFieldValuesNumeric(); 3173 } 3174 3175 final override SingleFieldCalculator makeCalculator() 3176 { 3177 return new QuantileCalculator(fieldIndex); 3178 } 3179 3180 class QuantileCalculator : SingleFieldCalculator 3181 { 3182 this(size_t fieldIndex) 3183 { 3184 super(fieldIndex); 3185 } 3186 3187 final override QuantileOperator getOperator() 3188 { 3189 return this.outer; 3190 } 3191 3192 /* Work is done by saving the field values. */ 3193 final override void processNextField(const char[] nextField) 3194 { } 3195 3196 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3197 { 3198 import tsv_numerics : quantile; 3199 return printOptions.formatNumber( 3200 quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex))); 3201 } 3202 } 3203 } 3204 3205 unittest // QuantileOperator 3206 { 3207 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3208 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3209 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3210 3211 auto defaultMissing = new MissingFieldPolicy; 3212 3213 /* Same as the median tests. */ 3214 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50); 3215 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50); 3216 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50); 3217 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50); 3218 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50); 3219 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50); 3220 3221 /* The extremes (0, 1), are min and max. */ 3222 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0); 3223 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0); 3224 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0); 3225 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0); 3226 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0); 3227 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0); 3228 3229 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0); 3230 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0); 3231 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0); 3232 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0); 3233 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0); 3234 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0); 3235 3236 /* For missing policies, re-use the median tests. */ 3237 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 3238 testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], 3239 new MissingFieldPolicy(true, ""), 0.5); // Exclude missing 3240 testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], 3241 new MissingFieldPolicy(false, "0"), 0.5); // Replace missing 3242 } 3243 3244 /** MadOperator produces the median absolute deviation from the median. This is a numeric 3245 * operation. 3246 * 3247 * The result is the raw MAD value, without a normalization applied. 3248 * 3249 * All the field values are stored in memory as part of this calculation. This is 3250 * handled by unique key value lists. 3251 */ 3252 class MadOperator : SingleFieldOperator 3253 { 3254 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3255 { 3256 super("mad", fieldIndex, missingPolicy); 3257 setSaveFieldValuesNumeric(); 3258 } 3259 3260 final override SingleFieldCalculator makeCalculator() 3261 { 3262 return new MadCalculator(fieldIndex); 3263 } 3264 3265 class MadCalculator : SingleFieldCalculator 3266 { 3267 this(size_t fieldIndex) 3268 { 3269 super(fieldIndex); 3270 } 3271 3272 final override MadOperator getOperator() 3273 { 3274 return this.outer; 3275 } 3276 3277 /* Work is done by saving the field values. */ 3278 final override void processNextField(const char[] nextField) 3279 { } 3280 3281 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3282 { 3283 import std.math : abs; 3284 import tsv_numerics : rangeMedian; 3285 3286 auto median = valuesLists.numericValuesMedian(fieldIndex); 3287 auto values = valuesLists.numericValues(fieldIndex); 3288 auto medianDevs = new double[values.length]; 3289 foreach (int i, double v; values) 3290 medianDevs[i] = abs(v - median); 3291 3292 return printOptions.formatNumber(medianDevs.rangeMedian); 3293 } 3294 } 3295 } 3296 3297 unittest // MadOperator 3298 { 3299 auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]]; 3300 auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]]; 3301 auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]]; 3302 3303 testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]); 3304 testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]); 3305 testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]); 3306 testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]); 3307 testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]); 3308 testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]); 3309 3310 auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]]; 3311 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], 3312 new MissingFieldPolicy(true, "")); // Exclude missing 3313 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"], 3314 new MissingFieldPolicy(false, "0")); // Replace missing 3315 } 3316 3317 /** Generates the variance of the fields values. This is a numeric operator. 3318 */ 3319 class VarianceOperator : SingleFieldOperator 3320 { 3321 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3322 { 3323 super("var", fieldIndex, missingPolicy); 3324 } 3325 3326 final override SingleFieldCalculator makeCalculator() 3327 { 3328 return new VarianceCalculator(fieldIndex); 3329 } 3330 3331 class VarianceCalculator : SingleFieldCalculator 3332 { 3333 private double _count = 0.0; 3334 private double _mean = 0.0; 3335 private double _m2 = 0.0; // Sum of squares of differences from current mean 3336 3337 this(size_t fieldIndex) 3338 { 3339 super(fieldIndex); 3340 } 3341 3342 final override VarianceOperator getOperator() 3343 { 3344 return this.outer; 3345 } 3346 3347 final override void processNextField(const char[] nextField) 3348 { 3349 _count += 1.0; 3350 double fieldValue = nextField.to!double; 3351 double delta = fieldValue - _mean; 3352 _mean += delta / _count; 3353 _m2 += delta * (fieldValue - _mean); 3354 } 3355 3356 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3357 { 3358 return printOptions.formatNumber( 3359 (_count >= 2.0) ? (_m2 / (_count - 1.0)) : double.nan); 3360 } 3361 } 3362 } 3363 3364 unittest // VarianceOperator 3365 { 3366 auto col1File = [["5"], ["10"], ["15"]]; 3367 auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]]; 3368 auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]]; 3369 3370 testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]); 3371 testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]); 3372 testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]); 3373 testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]); 3374 testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]); 3375 testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]); 3376 3377 auto col1misFile = [["5"], ["10"], [""]]; 3378 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"], 3379 new MissingFieldPolicy(true, "")); // Exclude missing 3380 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"], 3381 new MissingFieldPolicy(false, "15")); // Replace missing 3382 } 3383 3384 /** Generates the standard deviation of the fields values. This is a numeric operator. 3385 */ 3386 class StDevOperator : SingleFieldOperator 3387 { 3388 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3389 { 3390 super("stdev", fieldIndex, missingPolicy); 3391 } 3392 3393 final override SingleFieldCalculator makeCalculator() 3394 { 3395 return new StDevCalculator(fieldIndex); 3396 } 3397 3398 class StDevCalculator : SingleFieldCalculator 3399 { 3400 private double _count = 0.0; 3401 private double _mean = 0.0; 3402 private double _m2 = 0.0; // Sum of squares of differences from current mean 3403 3404 this(size_t fieldIndex) 3405 { 3406 super(fieldIndex); 3407 } 3408 3409 final override StDevOperator getOperator() 3410 { 3411 return this.outer; 3412 } 3413 3414 final override void processNextField(const char[] nextField) 3415 { 3416 _count += 1.0; 3417 double fieldValue = nextField.to!double; 3418 double delta = fieldValue - _mean; 3419 _mean += delta / _count; 3420 _m2 += delta * (fieldValue - _mean); 3421 } 3422 3423 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3424 { 3425 import std.math : sqrt; 3426 return printOptions.formatNumber( 3427 (_count >= 2.0) ? (_m2 / (_count - 1.0)).sqrt : double.nan); 3428 } 3429 } 3430 } 3431 3432 /* StDevOperator unit tests - These would be improved with a tolerance option. 3433 */ 3434 unittest 3435 { 3436 auto col1File = [["1"], ["4"], ["7"]]; 3437 auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]]; 3438 auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]]; 3439 3440 testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]); 3441 testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]); 3442 testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]); 3443 testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]); 3444 testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]); 3445 testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]); 3446 3447 auto col1misFile = [["1"], ["4"], [""]]; 3448 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], 3449 new MissingFieldPolicy(true, "")); // Exclude missing 3450 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], 3451 new MissingFieldPolicy(false, "7")); // Replace missing 3452 } 3453 3454 /** UniqueCountOperator generates the number of unique values. Unique values are 3455 * based on exact text match calculation, not a numeric comparison. 3456 * 3457 * All the unique field values are stored in memory as part of this calculation. 3458 */ 3459 class UniqueCountOperator : SingleFieldOperator 3460 { 3461 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3462 { 3463 super("unique_count", fieldIndex, missingPolicy); 3464 } 3465 3466 final override SingleFieldCalculator makeCalculator() 3467 { 3468 return new UniqueCountCalculator(fieldIndex); 3469 } 3470 3471 class UniqueCountCalculator : SingleFieldCalculator 3472 { 3473 private bool[string] _values; 3474 3475 this(size_t fieldIndex) 3476 { 3477 super(fieldIndex); 3478 } 3479 3480 final override UniqueCountOperator getOperator() 3481 { 3482 return this.outer; 3483 } 3484 3485 final override void processNextField(const char[] nextField) 3486 { 3487 if (nextField !in _values) _values[nextField.to!string] = true; 3488 } 3489 3490 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3491 { 3492 return printOptions.formatNumber(_values.length); 3493 } 3494 } 3495 } 3496 3497 unittest // UniqueCount 3498 { 3499 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 3500 auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 3501 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3502 3503 testSingleFieldOperator!UniqueCountOperator(col1File, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]); 3504 testSingleFieldOperator!UniqueCountOperator(col2File, 0, "unique_count", ["0", "1", "2", "2"]); 3505 testSingleFieldOperator!UniqueCountOperator(col2File, 1, "unique_count", ["0", "1", "1", "2"]); 3506 testSingleFieldOperator!UniqueCountOperator(col3File, 0, "unique_count", ["0", "1", "2", "3"]); 3507 testSingleFieldOperator!UniqueCountOperator(col3File, 1, "unique_count", ["0", "1", "2", "2"]); 3508 testSingleFieldOperator!UniqueCountOperator(col3File, 2, "unique_count", ["0", "1", "2", "3"]); 3509 3510 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 3511 testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"], 3512 new MissingFieldPolicy(true, "")); // Exclude missing 3513 3514 3515 testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"], 3516 new MissingFieldPolicy(false, "XYZ")); // Replace missing 3517 } 3518 3519 /** MissingCountOperator generates the number of missing values. This overrides 3520 * the global missingFieldsPolicy. 3521 */ 3522 class MissingCountOperator : SingleFieldOperator 3523 { 3524 private MissingFieldPolicy _globalMissingPolicy; 3525 3526 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3527 { 3528 _globalMissingPolicy = missingPolicy; 3529 super("missing_count", fieldIndex, new MissingFieldPolicy(false, "")); 3530 } 3531 3532 final override SingleFieldCalculator makeCalculator() 3533 { 3534 return new MissingCountCalculator(fieldIndex); 3535 } 3536 3537 class MissingCountCalculator : SingleFieldCalculator 3538 { 3539 private size_t _missingCount = 0; 3540 3541 this(size_t fieldIndex) 3542 { 3543 super(fieldIndex); 3544 } 3545 3546 final override MissingCountOperator getOperator() 3547 { 3548 return this.outer; 3549 } 3550 3551 final override void processNextField(const char[] nextField) 3552 { 3553 if (this.outer._globalMissingPolicy.isMissingField(nextField)) _missingCount++; 3554 } 3555 3556 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3557 { 3558 return printOptions.formatNumber(_missingCount); 3559 } 3560 } 3561 } 3562 3563 unittest // MissingCount 3564 { 3565 auto col1File = [["a"], ["b"], [""], [" "], [""]]; 3566 auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 3567 auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; 3568 3569 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]); 3570 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]); 3571 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]); 3572 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]); 3573 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]); 3574 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]); 3575 3576 auto excludeMissing = new MissingFieldPolicy(true, ""); 3577 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3578 3579 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing); 3580 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing); 3581 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing); 3582 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing); 3583 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing); 3584 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing); 3585 3586 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing); 3587 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing); 3588 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing); 3589 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing); 3590 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing); 3591 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], replaceMissing); 3592 } 3593 3594 /** NotMissingCountOperator generates the number of not-missing values. This overrides 3595 * the global missingFieldsPolicy. 3596 */ 3597 class NotMissingCountOperator : SingleFieldOperator 3598 { 3599 private MissingFieldPolicy _globalMissingPolicy; 3600 3601 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3602 { 3603 _globalMissingPolicy = missingPolicy; 3604 super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, "")); 3605 } 3606 3607 final override SingleFieldCalculator makeCalculator() 3608 { 3609 return new NotMissingCountCalculator(fieldIndex); 3610 } 3611 3612 class NotMissingCountCalculator : SingleFieldCalculator 3613 { 3614 private size_t _notMissingCount = 0; 3615 3616 this(size_t fieldIndex) 3617 { 3618 super(fieldIndex); 3619 } 3620 3621 final override NotMissingCountOperator getOperator() 3622 { 3623 return this.outer; 3624 } 3625 3626 final override void processNextField(const char[] nextField) 3627 { 3628 if (!this.outer._globalMissingPolicy.isMissingField(nextField)) _notMissingCount++; 3629 } 3630 3631 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3632 { 3633 return printOptions.formatNumber(_notMissingCount); 3634 } 3635 } 3636 } 3637 3638 unittest // NotMissingCount 3639 { 3640 auto col1File = [["a"], ["b"], [""], [" "], [""]]; 3641 auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 3642 auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; 3643 3644 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]); 3645 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"]); 3646 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"]); 3647 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]); 3648 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]); 3649 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]); 3650 3651 auto excludeMissing = new MissingFieldPolicy(true, ""); 3652 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3653 3654 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing); 3655 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing); 3656 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing); 3657 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing); 3658 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing); 3659 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing); 3660 3661 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing); 3662 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing); 3663 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing); 3664 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing); 3665 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing); 3666 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing); 3667 } 3668 3669 /** ModeOperator outputs the most frequent value seen. In the event of a tie, the 3670 * first value seen is produced. 3671 * 3672 * All the field values are stored in memory as part of this calculation. 3673 * 3674 */ 3675 class ModeOperator : SingleFieldOperator 3676 { 3677 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3678 { 3679 super("mode", fieldIndex, missingPolicy); 3680 } 3681 3682 final override SingleFieldCalculator makeCalculator() 3683 { 3684 return new ModeCalculator(fieldIndex); 3685 } 3686 3687 class ModeCalculator : SingleFieldCalculator 3688 { 3689 private size_t[string] _valueCounts; 3690 private Appender!(string[]) _uniqueValues; 3691 3692 this(size_t fieldIndex) 3693 { 3694 super(fieldIndex); 3695 } 3696 3697 final override ModeOperator getOperator() 3698 { 3699 return this.outer; 3700 } 3701 3702 final override void processNextField(const char[] nextField) 3703 { 3704 auto countPtr = (nextField in _valueCounts); 3705 3706 if (countPtr is null) 3707 { 3708 string value = nextField.to!string; 3709 _uniqueValues.put(value); 3710 _valueCounts[value] = 1; 3711 } 3712 else 3713 { 3714 (*countPtr)++; 3715 } 3716 } 3717 3718 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3719 { 3720 string modeValue = ""; 3721 size_t modeCount = 0; 3722 3723 foreach (value; _uniqueValues.data) 3724 { 3725 assert(value in _valueCounts); 3726 3727 auto count = _valueCounts[value]; 3728 3729 if (count > modeCount) 3730 { 3731 modeValue = value; 3732 modeCount = count; 3733 } 3734 } 3735 3736 return modeValue; 3737 } 3738 } 3739 } 3740 3741 unittest // ModeOperator 3742 { 3743 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3744 auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 3745 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3746 3747 testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]); 3748 testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]); 3749 testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]); 3750 testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]); 3751 testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]); 3752 testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]); 3753 3754 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3755 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], 3756 new MissingFieldPolicy(true, "")); // Exclude missing 3757 3758 3759 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], 3760 new MissingFieldPolicy(false, "X")); // Replace missing 3761 } 3762 3763 /** ModeCountOperator outputs the count of the most frequent value seen. 3764 * 3765 * All the field values are stored in memory as part of this calculation. 3766 * 3767 */ 3768 class ModeCountOperator : SingleFieldOperator 3769 { 3770 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3771 { 3772 super("mode_count", fieldIndex, missingPolicy); 3773 } 3774 3775 final override SingleFieldCalculator makeCalculator() 3776 { 3777 return new ModeCountCalculator(fieldIndex); 3778 } 3779 3780 class ModeCountCalculator : SingleFieldCalculator 3781 { 3782 private size_t[string] _valueCounts; 3783 3784 this(size_t fieldIndex) 3785 { 3786 super(fieldIndex); 3787 } 3788 3789 final override ModeCountOperator getOperator() 3790 { 3791 return this.outer; 3792 } 3793 3794 final override void processNextField(const char[] nextField) 3795 { 3796 auto countPtr = (nextField in _valueCounts); 3797 3798 if (countPtr is null) 3799 { 3800 string value = nextField.to!string; 3801 _valueCounts[value] = 1; 3802 } 3803 else 3804 { 3805 (*countPtr)++; 3806 } 3807 } 3808 3809 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3810 { 3811 size_t modeCount = 0; 3812 foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 3813 return printOptions.formatNumber(modeCount); 3814 } 3815 } 3816 } 3817 3818 unittest // ModeCountOperator 3819 { 3820 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3821 auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 3822 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3823 3824 testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 3825 testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 3826 testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 3827 testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 3828 testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 3829 testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); 3830 3831 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3832 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], 3833 new MissingFieldPolicy(true, "")); // Exclude missing 3834 3835 3836 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], 3837 new MissingFieldPolicy(false, "X")); // Replace missing 3838 } 3839 3840 /** ValuesOperator outputs each value delimited by an alternate delimiter character. 3841 * 3842 * All the field values are stored in memory as part of this calculation. This is 3843 * handled by unique key value lists. 3844 */ 3845 3846 class ValuesOperator : SingleFieldOperator 3847 { 3848 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3849 { 3850 super("values", fieldIndex, missingPolicy); 3851 setSaveFieldValuesText(); 3852 } 3853 3854 final override SingleFieldCalculator makeCalculator() 3855 { 3856 return new ValuesCalculator(fieldIndex); 3857 } 3858 3859 class ValuesCalculator : SingleFieldCalculator 3860 { 3861 this(size_t fieldIndex) 3862 { 3863 super(fieldIndex); 3864 } 3865 3866 final override ValuesOperator getOperator() 3867 { 3868 return this.outer; 3869 } 3870 3871 /* Work is done by saving the field values. */ 3872 final override void processNextField(const char[] nextField) 3873 { } 3874 3875 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3876 { 3877 return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); 3878 } 3879 } 3880 } 3881 3882 unittest // ValuesOperator 3883 { 3884 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3885 auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 3886 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; 3887 3888 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 3889 testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 3890 testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 3891 testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 3892 testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 3893 testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); 3894 3895 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], 3896 new MissingFieldPolicy(true, "")); // Exclude missing 3897 3898 3899 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], 3900 new MissingFieldPolicy(false, "X")); // Replace missing 3901 } 3902 3903 /** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter 3904 * character. Values are output in the order seen. 3905 * 3906 * All unique field values are stored in memory as part of this calculation. 3907 * 3908 */ 3909 class UniqueValuesOperator : SingleFieldOperator 3910 { 3911 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3912 { 3913 super("unique_values", fieldIndex, missingPolicy); 3914 } 3915 3916 final override SingleFieldCalculator makeCalculator() 3917 { 3918 return new UniqueValuesCalculator(fieldIndex); 3919 } 3920 3921 class UniqueValuesCalculator : SingleFieldCalculator 3922 { 3923 private size_t[string] _valuesHash; 3924 private Appender!(string[]) _uniqueValues; 3925 3926 this(size_t fieldIndex) 3927 { 3928 super(fieldIndex); 3929 } 3930 3931 final override UniqueValuesOperator getOperator() 3932 { 3933 return this.outer; 3934 } 3935 3936 final override void processNextField(const char[] nextField) 3937 { 3938 auto ptr = (nextField in _valuesHash); 3939 3940 if (ptr is null) 3941 { 3942 string value = nextField.to!string; 3943 _uniqueValues.put(value); 3944 _valuesHash[value] = 1; 3945 } 3946 } 3947 3948 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3949 { 3950 return _uniqueValues.data.join(printOptions.valuesDelimiter); 3951 } 3952 } 3953 } 3954 3955 unittest // UniqueValuesOperator 3956 { 3957 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3958 auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 3959 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; 3960 3961 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 3962 testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 3963 testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 3964 testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 3965 testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]); 3966 testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); 3967 3968 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], 3969 new MissingFieldPolicy(true, "")); // Exclude missing 3970 3971 3972 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], 3973 new MissingFieldPolicy(false, "X")); // Replace missing 3974 }