/**
Command line tool that reads TSV files and summarizes field values associated with
equivalent keys.

Copyright (c) 2016-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_summarize;

import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
import std.array : join;
import std.conv : to;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple;
import std.container : DList;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /* Program entry point. Parses the command line, then runs the summarization.
     * Returns 0 on success (or when help/version was requested) and 1 on any error.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try tsvSummarize(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

   make    color   time
   ford    blue    131
   chevy   green   124
   ford    red     128
   bmw     black   118
   bmw     black   126
   ford    blue    122

The min and average times for each make is generated by the command:

   $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

   make   time_min  time_mean
   ford   122       127
   chevy  124       124
   bmw    118       122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

   $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similar way, generally following:

   --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

   --median 2,3,4
   --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

   --quantile 2:0.25                // Quantile 1 of field 2
   --quantile 2-4:0.25,0.5,0.75     // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
   count       range        mad            values
   retain      sum          var            unique-values
   first       mean         stdev          unique-count
   last        median       mode           missing-count
   min         quantile     mode-count     not-missing-count
   max

Numeric values are printed to 12 significant digits by default. This can be
changed using the '--p|float-precision' option. If six or less it sets the
number of significant digits after the decimal point. If greater than six it
sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";

/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                // -g, --group-by
    bool hasHeader = false;            // --header
    bool writeHeader = false;          // -w, --write-header
    char inputFieldDelimiter = '\t';   // --d|delimiter
    char valuesDelimiter = '|';        // --v|values-delimiter
    size_t floatPrecision = 12;        // --p|float-precision
    bool excludeMissing = false;       // --x|exclude-missing
    string missingValueReplacement;    // --r|replace-missing
    bool helpVerbose = false;          // --help-verbose
    bool versionWanted = false;        // --V|version
    DList!Operator operators;          // Operators, in the order specified.
    size_t endFieldIndex = 0;          // Derived value. Max field index used plus one.
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   // Derived value.

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import getopt_inorder;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by", "<field-list> Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header", " Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count", " Count occurrences of each unique key.", &countOptionHandler,
                "count-header", "STR Count occurrences of each unique key, use header STR.", &countHeaderOptionHandler,
                "retain", "<field-list> Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first", "<field-list>[:STR] First value seen.", &operatorOptionHandler!FirstOperator,
                "last", "<field-list>[:STR] Last value seen.", &operatorOptionHandler!LastOperator,
                "min", "<field-list>[:STR] Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max", "<field-list>[:STR] Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range", "<field-list>[:STR] Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum", "<field-list>[:STR] Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean", "<field-list>[:STR] Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median", "<field-list>[:STR] Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * Throws an Exception if the option value is malformed, if a custom header is given
     * together with multiple fields, or if the operator does not allow custom headers.
     * Errors raised while parsing the field list are re-thrown with the option name
     * prefixed to the message.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsvutil : parseFieldList;

        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a trailing ':' with no header after it. */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
        {
            auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

            if (!valSplit[2].empty)    // Header specified
            {
                if (fieldNum > 1)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                               option, optionVal));
                }
                else if (!op.allowCustomHeader)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                               option, optionVal));
                }

                op.setCustomHeader(valSplit[2].to!string);
            }

            operators.insertBack(op);
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
        catch (Exception exc)
        {
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     * The value has the form '<field-list>:<prob>[,<prob>...][:<header>]'. One operator is
     * created per (field, probability) pair. Throws an Exception on malformed values,
     * probabilities outside [0.0,1.0], or a custom header used with more than one field
     * or probability.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsvutil : parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            fieldIndices ~= fieldIndex;
        }
        catch (Exception exc)
        {
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated conjunction so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator with the default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator using STR as its header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}

/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * Reads each input file (or standard input when no files are given, or when a file
 * name is "-"), feeds the needed fields of every line to a Summarizer, then writes the
 * summary to standard output. Throws an Exception if a line has too few fields or if
 * the summarizer fails to process a line.
 */
void tsvSummarize(TsvSummarizeOptions cmdopt, in string[] inputFiles)
{
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* Pick the Summarizer based on the number of key-fields entered. */
    auto summarizer =
        (cmdopt.keyFields.length == 0)
        ? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (cmdopt.keyFields.length == 1)
        ? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Add the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* Process each input file, one line at a time. */
    auto lineFields = new char[][](cmdopt.endFieldIndex);
    bool headerFound = false;
    foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"])
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();

        /* Name used for this input in error messages. */
        const displayName = (filename == "-") ? "Standard Input" : filename;

        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum);

            /* Copy the needed number of fields to the fields array.
             * Note: The number is zero if no operator needs fields. Notably, the count
             * operator. Used by itself, it counts the number of input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t fieldIndex = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter))
                {
                    if (fieldIndex == cmdopt.endFieldIndex) break;
                    lineFields[fieldIndex] = fieldValue;
                    fieldIndex++;
                }

                if (fieldIndex == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[fieldIndex] = line;
                    fieldIndex++;
                }

                if (fieldIndex < cmdopt.endFieldIndex)
                {
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               displayName, lineNum));
                }
            }

            if (cmdopt.hasHeader && lineNum == 1)
            {
                /* Only the header of the first file is passed on; later ones are skipped. */
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
            }
            else
            {
                /* Process the line. Processing will fail (throw) if a field cannot be
                 * converted to the expected type.
                 */
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    throw new Exception(
                        format("Could not process line or field: %s\n File: %s Line: %s%s",
                               exc.msg, displayName, lineNum,
                               (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
                }
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* Whew! We're done processing input data. Run the calculations and print. */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);
    auto stdoutWriter = stdout.lockingTextWriter;

    if (cmdopt.hasHeader || cmdopt.writeHeader)
    {
        summarizer.writeSummaryHeader(stdoutWriter, printOptions);
    }

    summarizer.writeSummaryBody(stdoutWriter, printOptions);
}

/* The default field header. This is used when the input doesn't have field headers,
 * but field headers are used in the output. The default is "fieldN", where N is the
 * 1-upped field number.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    enum prefix = "field";
    return prefix ~ (fieldIndex + 1).to!string;
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}

/* Produce a summary header from a field header. The result has the form
 * "<fieldHeader>_<operation>". e.g. If the field header is "length" and the operation is
 * "max", the summary header is "length_max". The field header typically comes from a
 * header line in the input data or was constructed by fieldHeaderFromIndex().
 *
 * If operationName is the empty string, then fieldHeader is used unchanged. This supports
 * the Retain operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}

/* SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically
 * specified with command line options, it is separated out for modularity.
*/
struct SummarizerPrintOptions
{
    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    import std.traits : isFloatingPoint, isIntegral;

    /* Formats a floating point or integral number by forwarding to
     * tsv_numerics.formatNumber with this object's floatPrecision.
     */
    auto formatNumber(T)(T n) const
        if (isFloatingPoint!T || isIntegral!T)
    {
        import tsv_numerics : formatNumber;
        return formatNumber!T(n, floatPrecision);
    }
}

/* A Summarizer holds the summarization state and does the basic processing. Reading
 * files and splitting input lines is the caller's job.
 * API:
 * - setOperators - Called once after construction with the operators to run.
 * - processHeaderLine - Called with the header line of each file. Returns true if
 *   it was the first header line processed (used when reading multiple files).
 * - processNextLine - Called with each non-header line.
 * - writeSummaryHeader - Called to write the header line.
 * - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    void setOperators(InputRange!Operator op);
    bool processHeaderLine(const char[][] lineFields);
    void processNextLine(const char[][] lineFields);
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/* SummarizerBase holds what every summarizer shares: the operator list, the missing
 * field policy, the input delimiter, and the SharedFieldValues registry. Anything that
 * depends on unique keys (Calculators, UniqueKeyValuesLists per key) is left to the
 * derived classes.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _inputFieldDelimiter = inputFieldDelimiter;
        _missingPolicy = missingPolicy;
    }

    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /* Records the operators, in order, and registers every field index an operator
     * wants saved. The SharedFieldValues object is created lazily, the first time an
     * operator asks for a saved field. Called after construction.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to save for this operator. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (idx; numericIndices) _sharedFieldValues.addNumericIndex(idx);
            foreach (idx; textIndices) _sharedFieldValues.addTextIndex(idx);
        }
    }

    /* Passes the first header line seen to every operator and returns true. Any later
     * call is ignored and returns false.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /* Returns a new UniqueKeyValuesLists, or null when no operator saves field values. */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}

/* The NoKeySummarizer is used when summarizing values across the entire input.
*
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
 */
class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;         // One per operator, in operator order.
    private UniqueKeyValuesLists _valueLists;  // Null if no operator saves field values.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /* Only one Calculator per Operation, so create them as Operators are added. */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Walk the stored operator list rather than 'operators'. InputRange is a
         * reference-type range and the base class call above has already iterated it
         * to the end, so a second pass over it would yield no elements.
         */
        foreach (op; _operators) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /* Feeds one data line to every calculator and to the saved-values lists, if any. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /* Writes the operator headers, joined by the field delimiter, followed by a newline. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /* Writes a single result line: each calculator's value, joined by the field delimiter. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}

/* KeySummarizerBase does work shared by the single key and multi-key summarizers. The
 * primary difference between those two is the formation of the key. The primary reason
 * for separating those into two separate classes is to simplify (speed-up) handling of
 * single field keys, which are the most common use case.
*/
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /* Per-key state: one Calculator per operator plus the saved field values (or null). */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;              // Keys in the order first seen.
    private UniqueKeyData[string] _uniqueKeyData;  // Lookup from key to its state.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /* Processes one data line for the given key. State for the key is created the first
     * time the key is seen; the key is copied to a string only in that case.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        auto dataPtr = (key in _uniqueKeyData);
        auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr;

        data.calculators.each!(x => x.processNextLine(lineFields));
        if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /* Registers a key not seen before: appends it to the ordered key list, creates a
     * Calculator for every operator, and stores and returns the new per-key state.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto calculators = new Calculator[_numOperators];
        size_t i = 0;
        foreach (op; _operators)
        {
            calculators[i] = op.makeCalculator;
            i++;
        }

        return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists());
    }

    /* Writes the key field header followed by the operator headers, delimiter separated. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /* Writes one result line per unique key, in the order the keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach(key; _uniqueKeys)
        {
            auto data = _uniqueKeyData[key];
            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream,
                data.calculators[]
                .map!(x => x.calculate(data.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter));
            put(outputStream, '\n');
        }
    }

    abstract string keyFieldHeader() const @property;
}

/* This Summarizer is for the case where the unique key is based on exactly one field.
 */
class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    /* The key header defaults to "fieldN"; it is replaced if a header line is processed. */
    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /* Takes the key header from the first header line processed. Returns true if this
     * was that first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: lineFields[_keyFieldIndex] is read below. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /* Uses the key field of the line, as-is, as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}

/* This Summarizer is for the case where the unique key is based on multiple fields.
850 */ 851 class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 852 { 853 private size_t[] _keyFieldIndices; 854 private string _keyFieldHeader; 855 private DList!string _uniqueKeys; 856 857 this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 858 { 859 super(inputFieldDelimiter, missingPolicy); 860 _keyFieldIndices = keyFieldIndices.dup; 861 _keyFieldHeader = 862 _keyFieldIndices.map!(i => fieldHeaderFromIndex(i)) 863 .join(inputFieldDelimiter); 864 } 865 866 override string keyFieldHeader() const @property 867 { 868 return _keyFieldHeader; 869 } 870 871 override bool processHeaderLine(const char[][] lineFields) 872 { 873 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 874 assert(_keyFieldIndices.length >= 2); 875 876 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 877 if (isFirstHeaderLine) 878 { 879 _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 880 } 881 return isFirstHeaderLine; 882 } 883 884 override void processNextLine(const char[][] lineFields) 885 { 886 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 887 assert(_keyFieldIndices.length >= 2); 888 889 string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 890 processNextLineWithKey(key, lineFields); 891 } 892 } 893 894 version(unittest) 895 { 896 /* testSummarizer is a helper that can run many types of unit tests against 897 * Summarizers. It can also test operators, but there are separate helper functions 898 * better suited for that purpose. 899 * 900 * Arguments are a command line args, an input file, and expected output. The 901 * input file and expected output are already split into lines and fields, the helper 902 * manages re-assembly. The program name from the command line args is printed if an 903 * an error occurs, it is useful to identify the test that failed. 
*
     * Note: Much of this is a duplication of tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     *
     * All failures are reported via assert. Messages are prefixed with cmdArgs[0].
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds an assert message. 'msg' is a format string applied to 'formatArgs'. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        /* Captured before processArgs so the message shows the args as passed in. */
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            if (cmdopt.hasHeader && lineNum == 1)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    /* Pass the exception text as a format argument, not as the format
                     * string. A '%' in the message would otherwise be read as a format
                     * specifier and make format() throw instead of reporting the error.
                     */
                    assert(false, formatAssertMessage("%s", exc.msg));
                }
            }
        }
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output: fields joined by the input delimiter, lines
         * joined by newline, with a trailing newline when there is any output at all.
         */
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}

unittest
{
    /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
     * extent, command line option handling (TsvSummarizeOptions). Individual operators
     * have separate tests, those tests test the no-key summarizer. The Values operator is
     * used in these tests. It engages a number of behaviors, and the results have limited
     * ambiguity. Using only one operator limits dependence on individual operators.
     */

    auto file1 = [["fld1", "fld2", "fld3"],
                  ["a", "a", "3"],
                  ["c", "a", "2b"],
                  ["c", "bc", ""],
                  ["a", "c", "2b"],
                  ["", "bc", ""],
                  ["c", "bc", "3"]];

    /* Single-key summarizer tests.
1008 */ 1009 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"], 1010 file1, 1011 [["fld1", "fld1_values"], 1012 ["a", "a|a"], 1013 ["c", "c|c|c"], 1014 ["", ""]] 1015 ); 1016 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"], 1017 file1, 1018 [["fld1", "fld2_values"], 1019 ["a", "a|c"], 1020 ["c", "a|bc|bc"], 1021 ["", "bc"]] 1022 ); 1023 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"], 1024 file1, 1025 [["fld1", "fld3_values"], 1026 ["a", "3|2b"], 1027 ["c", "2b||3"], 1028 ["", ""]] 1029 ); 1030 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"], 1031 file1, 1032 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1033 ["a", "a|a", "a|c", "3|2b"], 1034 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1035 ["", "", "bc", ""]] 1036 ); 1037 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"], 1038 file1, 1039 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1040 ["a", "a|a", "a|c", "3|2b"], 1041 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1042 ["", "", "bc", ""]] 1043 ); 1044 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"], 1045 file1, 1046 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1047 ["a", "3|2b", "a|c", "a|a"], 1048 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1049 ["", "", "bc", ""]] 1050 ); 1051 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"], 1052 file1, 1053 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1054 ["a", "3|2b", "a|c", "a|a"], 1055 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1056 ["", "", "bc", ""]] 1057 ); 1058 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"], 1059 file1, 1060 [["fld2", "fld1_values"], 1061 ["a", "a|c"], 1062 ["bc", "c||c"], 1063 ["c", "a"]] 1064 ); 1065 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"], 1066 file1, 1067 [["fld2", "fld2_values"], 1068 ["a", "a|a"], 1069 ["bc", "bc|bc|bc"], 1070 ["c", 
"c"]] 1071 ); 1072 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"], 1073 file1, 1074 [["fld2", "fld3_values"], 1075 ["a", "3|2b"], 1076 ["bc", "||3"], 1077 ["c", "2b"]] 1078 ); 1079 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"], 1080 file1, 1081 [["fld2", "fld1_values", "fld3_values"], 1082 ["a", "a|c", "3|2b"], 1083 ["bc", "c||c", "||3"], 1084 ["c", "a", "2b"]] 1085 ); 1086 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"], 1087 file1, 1088 [["fld2", "fld3_values", "fld1_values"], 1089 ["a", "3|2b", "a|c"], 1090 ["bc", "||3", "c||c"], 1091 ["c", "2b", "a"]] 1092 ); 1093 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"], 1094 file1, 1095 [["fld3", "fld1_values"], 1096 ["3", "a|c"], 1097 ["2b", "c|a"], 1098 ["", "c|"]] 1099 ); 1100 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"], 1101 file1, 1102 [["fld3", "fld2_values"], 1103 ["3", "a|bc"], 1104 ["2b", "a|c"], 1105 ["", "bc|bc"]] 1106 ); 1107 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"], 1108 file1, 1109 [["fld3", "fld1_values", "fld2_values"], 1110 ["3", "a|c", "a|bc"], 1111 ["2b", "c|a", "a|c"], 1112 ["", "c|", "bc|bc"]] 1113 ); 1114 1115 /* Multi-key summarizer tests. 
1116 */ 1117 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"], 1118 file1, 1119 [["fld1", "fld2", "fld1_values"], 1120 ["a", "a", "a"], 1121 ["c", "a", "c"], 1122 ["c", "bc", "c|c"], 1123 ["a", "c", "a"], 1124 ["", "bc", ""]] 1125 ); 1126 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"], 1127 file1, 1128 [["fld1", "fld2", "fld2_values"], 1129 ["a", "a", "a"], 1130 ["c", "a", "a"], 1131 ["c", "bc", "bc|bc"], 1132 ["a", "c", "c"], 1133 ["", "bc", "bc"]] 1134 ); 1135 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"], 1136 file1, 1137 [["fld1", "fld2", "fld3_values"], 1138 ["a", "a", "3"], 1139 ["c", "a", "2b"], 1140 ["c", "bc", "|3"], 1141 ["a", "c", "2b"], 1142 ["", "bc", ""]] 1143 ); 1144 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"], 1145 file1, 1146 [["fld1", "fld2", "fld3_values", "fld1_values"], 1147 ["a", "a", "3", "a"], 1148 ["c", "a", "2b", "c"], 1149 ["c", "bc", "|3", "c|c"], 1150 ["a", "c", "2b", "a"], 1151 ["", "bc", "", ""]] 1152 ); 1153 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"], 1154 file1, 1155 [["fld3", "fld2", "fld1_values"], 1156 ["3", "a", "a"], 1157 ["2b", "a", "c"], 1158 ["", "bc", "c|"], 1159 ["2b", "c", "a"], 1160 ["3", "bc", "c"]] 1161 ); 1162 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"], 1163 file1, 1164 [["fld3", "fld2", "fld1_values"], 1165 ["3", "a", "a"], 1166 ["2b", "a", "c"], 1167 ["", "bc", "c|"], 1168 ["2b", "c", "a"], 1169 ["3", "bc", "c"]] 1170 ); 1171 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"], 1172 file1, 1173 [["fld2", "fld1", "fld3", "fld2_values"], 1174 ["a", "a", "3", "a"], 1175 ["a", "c", "2b", "a"], 1176 ["bc", "c", "", "bc"], 1177 ["c", "a", "2b", "c"], 1178 ["bc", "", "", "bc"], 1179 ["bc", "c", "3", "bc"]] 1180 ); 1181 1182 /* Missing policies. 
*/ 1183 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"], 1184 file1, 1185 [["fld1", "fld1_values"], 1186 ["a", "a|a"], 1187 ["c", "c|c|c"], 1188 ["", ""]] 1189 ); 1190 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"], 1191 file1, 1192 [["fld1", "fld2_values"], 1193 ["a", "a|c"], 1194 ["c", "a|bc|bc"], 1195 ["", "bc"]] 1196 ); 1197 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"], 1198 file1, 1199 [["fld1", "fld3_values"], 1200 ["a", "3|2b"], 1201 ["c", "2b|3"], 1202 ["", ""]] 1203 ); 1204 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"], 1205 file1, 1206 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1207 ["a", "a|a", "a|c", "3|2b"], 1208 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1209 ["", "", "bc", ""]] 1210 ); 1211 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"], 1212 file1, 1213 [["fld1", "fld1_values"], 1214 ["a", "a|a"], 1215 ["c", "c|c|c"], 1216 ["", "NA"]] 1217 ); 1218 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"], 1219 file1, 1220 [["fld1", "fld2_values"], 1221 ["a", "a|c"], 1222 ["c", "a|bc|bc"], 1223 ["", "bc"]] 1224 ); 1225 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"], 1226 file1, 1227 [["fld1", "fld3_values"], 1228 ["a", "3|2b"], 1229 ["c", "2b|NA|3"], 1230 ["", "NA"]] 1231 ); 1232 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"], 1233 file1, 1234 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1235 ["a", "a|a", "a|c", "3|2b"], 1236 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1237 ["", "NA", "bc", "NA"]] 1238 ); 1239 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"], 1240 file1, 1241 [["fld1", "fld2", "fld3_values", "fld1_values"], 1242 ["a", "a", "3", "a"], 1243 ["c", "a", "2b", "c"], 
1244 ["c", "bc", "3", "c|c"], 1245 ["a", "c", "2b", "a"], 1246 ["", "bc", "", ""]] 1247 ); 1248 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"], 1249 file1, 1250 [["fld3", "fld2", "fld1_values"], 1251 ["3", "a", "a"], 1252 ["2b", "a", "c"], 1253 ["", "bc", "c"], 1254 ["2b", "c", "a"], 1255 ["3", "bc", "c"]] 1256 ); 1257 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"], 1258 file1, 1259 [["fld2", "fld1", "fld3", "fld2_values"], 1260 ["a", "a", "3", "a"], 1261 ["a", "c", "2b", "a"], 1262 ["bc", "c", "", "bc"], 1263 ["c", "a", "2b", "c"], 1264 ["bc", "", "", "bc"], 1265 ["bc", "c", "3", "bc"]] 1266 ); 1267 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"], 1268 file1, 1269 [["fld1", "fld2", "fld3_values", "fld1_values"], 1270 ["a", "a", "3", "a"], 1271 ["c", "a", "2b", "c"], 1272 ["c", "bc", "NA|3", "c|c"], 1273 ["a", "c", "2b", "a"], 1274 ["", "bc", "NA", "NA"]] 1275 ); 1276 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"], 1277 file1, 1278 [["fld3", "fld2", "fld1_values"], 1279 ["3", "a", "a"], 1280 ["2b", "a", "c"], 1281 ["", "bc", "c|NA"], 1282 ["2b", "c", "a"], 1283 ["3", "bc", "c"]] 1284 ); 1285 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"], 1286 file1, 1287 [["fld2", "fld1", "fld3", "fld2_values"], 1288 ["a", "a", "3", "a"], 1289 ["a", "c", "2b", "a"], 1290 ["bc", "c", "", "bc"], 1291 ["c", "a", "2b", "c"], 1292 ["bc", "", "", "bc"], 1293 ["bc", "c", "3", "bc"]] 1294 ); 1295 1296 /* Validate that the no-key summarizer works with testSummarizer helper function. 1297 */ 1298 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"], 1299 file1, 1300 [["fld1_values", "fld2_values"], 1301 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1302 ); 1303 1304 /* Header variations: no header line; auto-generated header line; custom headers. 
1305 */ 1306 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"], 1307 file1[1..$], 1308 [["a", "a|a"], 1309 ["c", "c|c|c"], 1310 ["", ""]] 1311 ); 1312 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"], 1313 file1[1..$], 1314 [["a", "a", "a"], 1315 ["c", "a", "a"], 1316 ["c", "bc", "bc|bc"], 1317 ["a", "c", "c"], 1318 ["", "bc", "bc"]] 1319 ); 1320 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"], 1321 file1[1..$], 1322 [["field2", "field1_values"], 1323 ["a", "a|c"], 1324 ["bc", "c||c"], 1325 ["c", "a"]] 1326 ); 1327 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"], 1328 file1[1..$], 1329 [["field3", "field2", "field1_values"], 1330 ["3", "a", "a"], 1331 ["2b", "a", "c"], 1332 ["", "bc", "c|"], 1333 ["2b", "c", "a"], 1334 ["3", "bc", "c"]] 1335 ); 1336 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"], 1337 file1, 1338 [["fld2", "Field3Values"], 1339 ["a", "3|2b"], 1340 ["bc", "||3"], 1341 ["c", "2b"]] 1342 ); 1343 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"], 1344 file1, 1345 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1346 ["a", "a", "3", "a"], 1347 ["c", "a", "2b", "c"], 1348 ["c", "bc", "|3", "c|c"], 1349 ["a", "c", "2b", "a"], 1350 ["", "bc", "", ""]] 1351 ); 1352 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"], 1353 file1[1..$], 1354 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1355 ["a", "3|2b", "a|c", "a|a"], 1356 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1357 ["", "", "bc", ""]] 1358 ); 1359 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1360 file1[1..$], 1361 [["field1", "field3", "field2", "field3_values", "ValsField1", 
"ValsField2"], 1362 ["a", "3", "a", "3", "a", "a"], 1363 ["c", "2b", "a", "2b", "c", "a"], 1364 ["c", "", "bc", "", "c", "bc"], 1365 ["a", "2b", "c", "2b", "a", "c"], 1366 ["", "", "bc", "", "", "bc"], 1367 ["c", "3", "bc", "3", "c", "bc"]] 1368 ); 1369 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1370 file1[1..$], 1371 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1372 ["a", "3", "a", "3", "a", "a"], 1373 ["c", "2b", "a", "2b", "c", "a"], 1374 ["c", "", "bc", "", "c", "bc"], 1375 ["a", "2b", "c", "2b", "a", "c"], 1376 ["", "", "bc", "", "", "bc"], 1377 ["c", "3", "bc", "3", "c", "bc"]] 1378 ); 1379 1380 /* Alternate file widths and lengths. 1381 */ 1382 1383 auto file3x2 = [["fld1", "fld2", "fld3"], 1384 ["a", "b", "c"], 1385 ["c", "b", "a"]]; 1386 1387 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"], 1388 file3x2, 1389 [["fld1", "fld3_values"], 1390 ["a", "c"], 1391 ["c", "a"]] 1392 ); 1393 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"], 1394 file3x2, 1395 [["fld2", "fld3_values"], 1396 ["b", "c|a"]] 1397 ); 1398 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"], 1399 file3x2, 1400 [["fld2", "fld1", "fld3_values"], 1401 ["b", "a", "c"], 1402 ["b", "c", "a"]] 1403 ); 1404 1405 auto file3x1 = [["fld1", "fld2", "fld3"], 1406 ["a", "b", "c"]]; 1407 1408 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"], 1409 file3x1, 1410 [["fld1", "fld3_values"], 1411 ["a", "c"]] 1412 ); 1413 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"], 1414 file3x1[1..$], 1415 [["a", "c"]] 1416 ); 1417 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"], 1418 file3x1, 1419 [["fld2", "fld1", "fld3_values"], 1420 ["b", "a", "c"]] 1421 ); 1422 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", 
"--values", "3"], 1423 file3x1[1..$], 1424 [["b", "a", "c"]] 1425 ); 1426 1427 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1428 1429 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"], 1430 file3x0, 1431 [["fld1", "fld3_values"]] 1432 ); 1433 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"], 1434 file3x0[1..$], 1435 [] 1436 ); 1437 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"], 1438 file3x0[1..$], 1439 [["field1", "field3_values"]] 1440 ); 1441 1442 1443 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"], 1444 file3x0, 1445 [["fld2", "fld1", "fld3_values"]] 1446 ); 1447 1448 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"], 1449 file3x0[1..$], 1450 [] 1451 ); 1452 1453 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"], 1454 file3x0[1..$], 1455 [["field2", "field1", "field3_values"]] 1456 ); 1457 1458 auto file2x1 = [["fld1", "fld2"], 1459 ["a", "b"]]; 1460 1461 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"], 1462 file2x1, 1463 [["fld1", "fld2_values"], 1464 ["a", "b"]] 1465 ); 1466 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"], 1467 file2x1, 1468 [["fld2", "fld1", "fld1_values"], 1469 ["b", "a", "a"]] 1470 ); 1471 1472 auto file2x0 = [["fld1", "fld2"]]; 1473 1474 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"], 1475 file2x0, 1476 [["fld1", "fld2_values"]] 1477 ); 1478 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"], 1479 file2x0, 1480 [["fld2", "fld1", "fld1_values"]] 1481 ); 1482 1483 auto file1x2 = [["fld1"], 1484 ["a"], 1485 [""]]; 1486 1487 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"], 1488 file1x2, 1489 [["fld1", "fld1_values"], 1490 ["a", "a"], 1491 ["", ""]] 1492 ); 1493 1494 auto file1x2b = [["fld1"], 1495 [""], 1496 [""]]; 1497 1498 
testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"], 1499 file1x2b, 1500 [["fld1", "fld1_values"], 1501 ["", "|"]] 1502 ); 1503 1504 auto file1x1 = [["fld1"], 1505 ["x"]]; 1506 1507 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"], 1508 file1x1, 1509 [["fld1", "fld1_values"], 1510 ["x", "x"]] 1511 ); 1512 1513 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"], 1514 file1x1[1..$], 1515 [["x", "x"]] 1516 ); 1517 1518 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"], 1519 file1x1[1..$], 1520 [["field1", "field1_values"], 1521 ["x", "x"]] 1522 ); 1523 1524 auto file1x1b = [["fld1"], 1525 [""]]; 1526 1527 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"], 1528 file1x1b, 1529 [["fld1", "fld1_values"], 1530 ["", ""]] 1531 ); 1532 1533 auto file1x0 = [["fld1"]]; 1534 1535 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"], 1536 file1x0, 1537 [["fld1", "fld1_values"]] 1538 ); 1539 1540 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"], 1541 file1x0[1..$], 1542 [] 1543 ); 1544 1545 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"], 1546 file1x0[1..$], 1547 [["field1", "field1_values"]] 1548 ); 1549 1550 /* Alternate delimiters. 
*/ 1551 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"], 1552 file1, 1553 [["fld1_values", "fld2_values"], 1554 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1555 ); 1556 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"], 1557 file1, 1558 [["fld1_values", "fld2_values"], 1559 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1560 ); 1561 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","], 1562 file1, 1563 [["fld1_values", "fld2_values"], 1564 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1565 ); 1566 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1567 "--delimiter", "^", "--values-delimiter", ":"], 1568 file1[1..$], 1569 [["field2", "field1_values"], 1570 ["a", "a:c"], 1571 ["bc", "c::c"], 1572 ["c", "a"]] 1573 ); 1574 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1575 "--values-delimiter", "\\"], 1576 file1[1..$], 1577 [["a", "a", "a"], 1578 ["c", "a", "a"], 1579 ["c", "bc", "bc\\bc"], 1580 ["a", "c", "c"], 1581 ["", "bc", "bc"]] 1582 ); 1583 } 1584 1585 /* Summary Operators and Calculators 1586 * 1587 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1588 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1589 * Calculator is used to manage the summary calculation for each unique key in the input. 1590 * 1591 * As an example, consider the command: 1592 * 1593 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1594 * 1595 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1596 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1597 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1598 * calculator needs to track occurrence count and sum. Calculators produce the final 1599 * value when all processing is finished. 
*
 * Summary field headers
 *
 * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden
 * on the command line. These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
 */

/* An Operator represents one summary calculation requested on the command line. */
interface Operator
{
    @property string header();
    @property string name();
    void processHeaderLine(const char[][] fields);
    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved
    Calculator makeCalculator();
}

/* A Calculator receives input lines and produces the final summary value as text. */
interface Calculator
{
    void processNextLine(const char[][] fields);
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/* Describes how missing (empty) field values are treated. Exactly one of three modes is
 * reported by the properties: use missing values unchanged, exclude them, or replace them.
 */
class MissingFieldPolicy
{
    private bool _useMissing = true;          // True if missing values are processed unchanged.
    private bool _replaceMissing = false;     // True if missing values are replaced.
    private string _missingReplacement;       // Replacement string if replaceMissing is true.

    this (in bool excludeMissing = false, in string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets the policy. A non-empty replacement string turns on replacement. Missing
     * values are used unchanged only when neither exclusion nor replacement is on.
     */
    void updatePolicy(in bool excludeMissing, in string missingReplacement)
    {
        _missingReplacement = missingReplacement;
        _replaceMissing = missingReplacement.length > 0;
        _useMissing = !(excludeMissing || _replaceMissing);
    }

    /* A field is missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return field.length == 0;
    }

    final bool useMissing() const @property
    {
        return _useMissing;
    }

    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list
 *   of the fields advertised by Operators as needing sharing.
This list gets created 1690 * during command initialization (SummarizerBase.setOperators). 1691 * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every 1692 * time a new unique key is found, in parellel to the Calculator objects created for the 1693 * key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes. 1694 * - A unique key's UniqueKeyValuesLists object is passed each input line, same as 1695 * Calculators, saving the values. 1696 * - Calculators retrieve the saved values during the calculation phase. The calculator's 1697 * ProcessNextField method is typically a no-op. 1698 * - Calculators cannot make assumptions about the order of the saved values. This is 1699 * pragmatic concession to median and quantile calculations, which need to sort the data, 1700 * at least partially. Rather than generate sorted copies, the current algorithms 1701 * sort the data in place. 1702 * 1703 * One concession to duplicate storage is that text and numeric versions of the same 1704 * field might be stored. The reason is because it's important to convert text to numbers 1705 * as they are read so that useful error messages can be generated. And, storing both 1706 * forms of the same field should be less common. 1707 * 1708 * The current implementation uses the same missing values policy for all fields. If 1709 * multiple policies become supported this will need to change. 1710 * 1711 * Built-in calculations - UniqueKeyValueLists have a built-in median operation. This is 1712 * to avoid repeated calculations of the median by different calculations. 1713 */ 1714 1715 class SharedFieldValues 1716 { 1717 // Arrays with field indices that need to be saved. 1718 private size_t[] _numericFieldIndices; 1719 private size_t[] _textFieldIndices; 1720 1721 /* Called during summarizer setup to add a shared field value for a specific field index. 1722 * eg. 
'--median 7' will add end up calling addNumericIdex(6), 6 being the zero-based index. 1723 * A specific index is only added once. 1724 */ 1725 final void addNumericIndex (size_t index) 1726 { 1727 if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index; 1728 } 1729 1730 /* Similar to addNumericIndex, except adds a text index. */ 1731 final void addTextIndex (size_t index) 1732 { 1733 if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index; 1734 } 1735 1736 /* Called every time a new key is found, or once at the beginning of the program if no keys 1737 * are being used (entire column summarized). 1738 */ 1739 final UniqueKeyValuesLists makeUniqueKeyValuesLists() 1740 { 1741 return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices); 1742 } 1743 } 1744 1745 class UniqueKeyValuesLists 1746 { 1747 /* A FieldValues object holds is a list of values collect for a specific field. A 1748 * unique key may hold several. For example, the command: 1749 * $ tsv-summarize --k 1 --median 4 -- median 5 1750 * requires keeping lists for both fields 4 and 5. This in turn will result in a 1751 * _numericFieldValues being a 2 element array, one with a list of field 4 values, 1752 * the second of field 5 values. Linear search is used to find a specific field. 1753 */ 1754 private FieldValues!double[] _numericFieldValues; 1755 private FieldValues!string[] _textFieldValues; 1756 private double[] _numericFieldMedians; 1757 1758 /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. 
*/ 1759 this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices) 1760 { 1761 if (numericFieldIndices.length > 0) 1762 { 1763 _numericFieldValues = new FieldValues!double[](numericFieldIndices.length); 1764 foreach (i, fieldIndex; numericFieldIndices) 1765 _numericFieldValues[i] = new FieldValues!double(fieldIndex); 1766 } 1767 1768 if (textFieldIndices.length > 0) 1769 { 1770 _textFieldValues = new FieldValues!string[](textFieldIndices.length); 1771 foreach (i, fieldIndex; textFieldIndices) 1772 _textFieldValues[i] = new FieldValues!string(fieldIndex); 1773 } 1774 } 1775 1776 void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) 1777 { 1778 _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1779 _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1780 } 1781 1782 private FieldValues!double findNumericFieldValues(size_t index) 1783 { 1784 alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b); 1785 auto r = find!pred(_numericFieldValues, index); 1786 assert(!r.empty); 1787 return r.front; 1788 } 1789 1790 private FieldValues!string findTextFieldValues(size_t index) 1791 { 1792 alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b); 1793 auto r = find!pred(_textFieldValues, index); 1794 assert(!r.empty); 1795 return r.front; 1796 } 1797 1798 final double[] numericValues(size_t index) 1799 { 1800 return findNumericFieldValues(index).getArray; 1801 } 1802 1803 final double[] numericValuesSorted(size_t index) 1804 { 1805 return findNumericFieldValues(index).getSortedArray; 1806 } 1807 1808 final string[] textValues(size_t index) 1809 { 1810 return findTextFieldValues(index).getArray; 1811 } 1812 1813 final string[] textValuesSorted(size_t index) 1814 { 1815 return findTextFieldValues(index).getSortedArray; 1816 } 1817 1818 final double numericValuesMedian(size_t index) 1819 { 1820 return findNumericFieldValues(index).median; 1821 } 1822 1823 private 
class FieldValues(ValueType)
    {
        import std.array : appender;

        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        /* Creates an empty value holder for the field at 'fieldIndex'. */
        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        /* Number of values collected so far. */
        final size_t length() const @property
        {
            return _values.data.length;
        }

        /* Index of the field this holder collects values for. */
        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Converts this holder's field to ValueType and appends it, honoring the missing
         * field policy: a missing field is either taken as-is, replaced, or skipped. Any
         * append invalidates the cached median and the sorted flag.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] rawField = fields[_fieldIndex];
            immutable bool takeAsIs =
                missingPolicy.useMissing || !missingPolicy.isMissingField(rawField);

            /* A missing field with no replacement configured is dropped. */
            if (!takeAsIs && !missingPolicy.replaceMissing) return;

            if (takeAsIs) _values.put(rawField.to!ValueType);
            else _values.put(missingPolicy.missingReplacement.to!ValueType);

            _haveMedian = false;
            _isSorted = false;
        }

        /* Returns the collected values (the appender's data array). */
        final auto values()
        {
            return _values.data;
        }

        /* Returns the collected values as an array, in their current order. */
        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Sorts the collected values in place the first time it is called after an
         * append, then returns them.
         */
        final ValueType[] getSortedArray()
        {
            import std.algorithm : sort;

            if (_isSorted) return _values.data;

            sort(_values.data);
            _isSorted = true;
            return _values.data;
        }

        /* Returns the median of the collected values. The result is cached until the
         * next value is appended.
         */
        final ValueType median()
        {
            import tsv_numerics : rangeMedian;

            if (!_haveMedian)
            {
                _medianValue = rangeMedian(_values.data);
                _haveMedian = true;
            }
            return _medianValue;
        }
    }
}

/* SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
*/
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Records the operator's name, field index, and missing field policy, and sets a
     * default output header built from the field index. The operator name is used as
     * the header suffix only when useHeaderSuffix is set.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        /* Default header. A custom header or an input header line may replace it later. */
        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Replaces the output header. Once set, processHeaderLine no longer changes it. */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    /* The operator name passed to the constructor. */
    final string name() const @property
    {
        return _name;
    }

    /* Whether setCustomHeader may be called on this operator. */
    final bool allowCustomHeader() const @property
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of this operator's field should be saved. They should be called
     * during construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    /* The missing field policy passed to the constructor. */
    final MissingFieldPolicy missingPolicy() @property
    {
        return _missingPolicy;
    }

    /* Index of the field this operator summarizes. */
    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* The current output header. */
    final string header() const @property
    {
        return _header;
    }

    /* Whether the operator name is appended to headers as a suffix. */
    final bool useHeaderSuffix() const @property
    {
        return _useHeaderSuffix;
    }

    /* Rebuilds the output header from the input header line, unless a custom header
     * has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, suffix);
    }

    /* Field indices whose numeric values were requested to be saved. */
    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    /* Field indices whose text values were requested to be saved. */
    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}

/* SingleFieldCalculator is a base class for the common case of calculators using a single
 * field. Derived classes implement processNextField() rather than processNextLine().
*/
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    /* Records the index of the field this calculator reads. */
    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    /* Index of the field this calculator reads. */
    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Picks this calculator's field out of the split line and forwards it to
     * processNextField, applying the operator's missing field policy: a missing field
     * is either passed through as-is, replaced, or skipped.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto policy = getOperator.missingPolicy;
        const char[] rawField = fields[_fieldIndex];

        immutable bool useFieldAsIs = policy.useMissing || !policy.isMissingField(rawField);
        if (useFieldAsIs)
        {
            processNextField(rawField);
            return;
        }

        if (policy.replaceMissing) processNextField(policy.missingReplacement);
    }

    abstract SingleFieldOperator getOperator();

    abstract void processNextField(const char[] field);
}

/* Unittest helper functions. Only compiled when -unittest is in effect. */
version(unittest)
{
    /** A helper for SingleFieldOperator unit tests.
     *
     * testSingleFieldOperator takes a set of split file values, a field index, a header
     * suffix, and a set of expected values. The expected values array contains the
     * initial value (zero entries) and the expected values after each line. (One more
     * expected value than input lines.) The zero entry case is what is generated for an
     * empty file. An example testing the 'min' operator against a file with 2 columns,
     * 3 rows, using field index 1:
     *
     *    testSingleFieldOperator!MinOperator(
     *       [["10", "100"],      // The split file. 3 rows by 2 columns.
     *        ["5", "50"],
     *        ["20", "200"]],
     *       1,                   // Field index (zero-based, so "100", "50", "200")
     *       "min",               // The header suffix, normally the operator name.
     *       ["nan", "100", "50", "50"]);   // Min value after processing each line.
     *
     * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
* Then the operator is tested against each column, a total of six calls. Headers
     * are automatically checked. Additional entries can be used to extend coverage.
     *
     * A non-default MissingFieldPolicy can be provided as an optional last argument.
     * Operator tests should include exclusion and replacement variations. See operator
     * unit tests for details.
     *
     * The testSingleFieldOperatorBase adds an additional capability - Custom operator
     * init arguments. Currently this is used only by the quantile operator.
     *
     * These tests do not check unique key behavior (group-by). Operators don't have info
     * about unique keys, and interact with them only indirectly, via Calculators.
     */
    void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
    {
        /* Same as the base helper, with no extra operator constructor arguments. */
        testSingleFieldOperatorBase!OperatorClass(
            splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
    }

    void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy,
         T extraOpInitArgs)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* Output range type used by the summarizer and by the capture buffers. */
        alias OutputBuffer = typeof(appender!(char[])());

        immutable numFields = (splitFile[0]).length;

        assert(fieldIndex < numFields,
               format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
                      headerSuffix));
        assert(expectedValues.length == splitFile.length + 1,
               format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      headerSuffix));

        /* printOptions - Only the 'values-delimiter' (2nd arg) is used in these tests. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* An input header line: "header0", "header1", ... one entry per field. */
        auto inputHeaderLine = new string[numFields];
        foreach (column, ref headerName; inputHeaderLine) headerName = "header" ~ column.to!string;

        /* The different expected output field headers. */
        auto outputFieldHeaderWithNoHeaderLine =
            summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), headerSuffix);
        auto outputFieldHeaderFromHeaderLine =
            summaryHeaderFromFieldHeader(inputHeaderLine[fieldIndex], headerSuffix);
        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'",
                          op.name, hc, rowIndex, fieldIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            immutable bool hasCustomHeader =
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader;
            immutable bool hasInputHeader =
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader;
            /* Every use case except the last one writes an output header. */
            immutable bool hasOutputHeader = hc != HeaderUsecase.NoHeaderLine_NoOutputHeader;

            assert(!hasCustomHeader || hasOutputHeader);

            auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!OutputBuffer('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));

            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* The output is generated by a Summarizer, so it has a trailing
                 * newline. chomp trims it before comparing.
                 */
                const char[] actualHeader = headerLineOutput.data.chomp;

                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                    assert(actualHeader == outputFieldHeaderFromHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, actualHeader,
                                               outputFieldHeaderFromHeaderLine));
                    break;
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(actualHeader == outputFieldHeaderWithNoHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, actualHeader,
                                               outputFieldHeaderWithNoHeaderLine));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(actualHeader == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, actualHeader,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }
            }

            /* Check the summary before any line is processed (the empty file case),
             * then again after each line.
             */
            foreach (rowCount, const char[] expected; expectedValues)
            {
                if (rowCount != 0) summarizer.processNextLine(splitFile[rowCount - 1]);

                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);

                const char[] actualValue = summaryLineOutput.data.chomp;
                assert(actualValue == expected,
                       valueAssertMessage(operatorArray[0], hc, rowCount, fieldIndex,
                                          actualValue, expected));
            }
        }
    }
}

/* ZeroFieldOperator is a base class for operators that take no input. The main use
 * case is the CountOperator, which counts the occurrences of each unique key. Other
 * uses are possible, for example, weighted random number assignment.
 *
 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify
 * the information available to such a routine.
In particular, the split fields passed
 * to processHeaderLine and processNextLine don't include all fields in the input,
 * something that might not be obvious when implementing an operator. (Only fields
 * required by operators acting on specific fields are included.)
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /* The operator name doubles as the default output header. */
    this(string operatorName)
    {
        _header = operatorName;
        _name = operatorName;
    }

    /* Replaces the output header. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* Custom headers are always allowed by this base class. */
    bool allowCustomHeader() const @property
    {
        return true;
    }

    /* The operator name passed to the constructor. */
    final string name() const @property
    {
        return _name;
    }

    /* The current output header. */
    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    abstract ZeroFieldCalculator makeCalculator();
}

/* ZeroFieldCalculator is a base class for operators that don't use fields as input.
 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() rather than processNextLine(), and the
 * single argument form of calculate() given as an abstract function.
*/
class ZeroFieldCalculator : Calculator
{
    this() { }

    /* The fields are ignored; each input line simply counts as one entry. */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__);
        processNextEntry();
    }

    /* Forwards to the single argument form; the saved values lists are not used. */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return calculate(printOptions);
    }

    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}

version(unittest)
{
    /** A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the initial value
     * (zero lines processed) followed by the expected value after each line, so it
     * has one more entry than the split file has rows.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        auto numFields = (splitFile[0]).length;

        /* The label matches the value that is printed (defaultHeader). */
        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. defaultHeader: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line: "header0", "header1", ... one entry per field. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i,
                                          summaryLineOutput.data.chomp, expectedValues[i]));
            }
        }
    }
}

/* Specific operators.
 *
 * Notes:
 * - A 'Calculator' inner class declared 'static' (CountCalculator) does not keep a
 *   reference to the context of the outer class, so its instances must hold all the
 *   state they need themselves.
 * - A 'Calculator' inner class declared without 'static' (for example RetainCalculator,
 *   MinCalculator) keeps a reference to its operator and returns it from getOperator()
 *   via 'this.outer'.
 */

/** CountOperator counts the number of occurrences of each unique key, or the number of
 * input lines if there is no unique key.
 *
 * CountOperator differs from most other operators in that it doesn't summarize a specific
 * field on the line. Instead it is summarizing a property of the unique key itself. For
 * this reason it doesn't derive from SingleFieldOperator.
*/
class CountOperator : ZeroFieldOperator
{
    this()
    {
        super("count");
    }

    /* Each calculator keeps its own independent count. */
    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator();
    }

    static class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        /* One more entry seen. */
        final override void processNextEntry()
        {
            _count += 1;
        }

        /* The number of entries seen, formatted by the print options. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}

unittest // CountOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The count depends only on the number of lines, so all three files give the same result. */
    auto expectedCounts = ["0", "1", "2", "3"];

    foreach (splitFile; [oneColumn, twoColumns, threeColumns])
        testZeroFieldOperator!CountOperator(splitFile, "count", expectedCounts);
}

/** RetainOperator retains the first occurrence of a field, without changing the header.
 *
 * RetainOperator is intended for fields where the value is expected to be the same for
 * all occurrences of the unique key, and the goal is to pass the value through unchanged.
 * It is like FirstOperator, except that the original header is preserved. The original
 * header preservation is setup in the call to the SingleFieldOperator constructor.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
*/
class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        /* No header suffix and no custom header: the field's own header is kept. */
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /* Only the first field passed in is kept; later ones are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = nextField.to!string;
            _done = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // RetainOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!RetainOperator(oneColumn, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!RetainOperator(oneColumnMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RetainOperator(oneColumnMissing, 0, "", ["", "NA", "NA", "NA"],
                                           new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** FirstOperator outputs the first value found for the field.
 */
class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /* Only the first field passed in is kept; later ones are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = nextField.to!string;
            _done = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // FirstOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!FirstOperator(oneColumn, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!FirstOperator(oneColumnMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!FirstOperator(oneColumnMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** LastOperator outputs the last value found for the field.
 */
class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /* Every field passed in overwrites the previous one. */
        final override void processNextField(const char[] nextField)
        {
            _value = nextField.to!string;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // LastOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!LastOperator(oneColumn, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(threeColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!LastOperator(oneColumnMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!LastOperator(oneColumnMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 */
class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /* The first value is always taken; later values only when strictly smaller. */
        final override void processNextField(const char[] nextField)
        {
            immutable double parsed = nextField.to!double;
            if (_isFirst || parsed < _value)
            {
                _value = parsed;
                _isFirst = false;
            }
        }

        /* Formats the minimum; it is still NaN if no value was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}

unittest // MinOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MinOperator(oneColumn, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(twoColumns, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoColumns, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(threeColumns, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeColumns, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeColumns, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    auto oneColumnMissing = [[""], ["10"], ["-10"]];
    testSingleFieldOperator!MinOperator(oneColumnMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MinOperator(oneColumnMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}

/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
 */
class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* The first value is always taken; later values only when strictly larger. */
        final override void processNextField(const char[] nextField)
        {
            immutable double parsed = nextField.to!double;
            if (_isFirst || parsed > _value)
            {
                _value = parsed;
                _isFirst = false;
            }
        }

        /* Formats the maximum; it is still NaN if no value was processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_value);
        }
    }
}

unittest // MaxOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MaxOperator(oneColumn, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    auto oneColumnMissing = [[""], ["-10"], ["10"]];
    testSingleFieldOperator!MaxOperator(oneColumnMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MaxOperator(oneColumnMissing, 0, "max", ["nan", "5", "5", "10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}

/* RangeOperator outputs the difference between the minimum and maximum values. If there
 * is a single value, or all values are the same, the range is zero. This is a numeric
 * operator.
*/
class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    /* Creates the calculator that tracks the running minimum and maximum. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;      // True until the first value arrives.
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /* The first value seeds both extremes. A later value can move at most one of
         * them, so the max check is tried first and the min check only if it fails.
         */
        final override void processNextField(const char[] nextField)
        {
            const double current = nextField.to!double;

            if (_isFirst)
            {
                _minValue = current;
                _maxValue = current;
                _isFirst = false;
                return;
            }

            if (current > _maxValue) _maxValue = current;
            else if (current < _minValue) _minValue = current;
        }

        /* With no values processed both extremes are still 0.0, giving a range of zero. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            const double spread = _maxValue - _minValue;
            return printOptions.formatNumber(spread);
        }
    }
}

unittest // RangeOperator
{
    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(col2File, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]);

    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          new MissingFieldPolicy(false, "5.5"));  // Replace missing
}

/* SumOperator adds up every value seen for the field. Each field value is converted
 * to a double, making this a numeric operator. The sum is zero when no values have
 * been processed.
 */
class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    /* Creates the calculator that accumulates the running total. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            const double addend = nextField.to!double;
            _total += addend;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            const double sum = _total;
            return printOptions.formatNumber(sum);
        }
    }
}

unittest // SumOperator
{
    auto col1File = [["10"], ["9.5"], ["11"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(col1File, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(col2File, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(col2File, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(col3File, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(col3File, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(col3File, 2, "sum", ["0", "-4.5", "-5", "7"]);

    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        new MissingFieldPolicy(false, "1.5"));  // Replace missing
}

/* MeanOperator computes the arithmetic mean (average) of the values seen for the
 * field. This is a numeric operator. The result is NaN when no values have been
 * processed.
 */
class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    /* Creates the calculator that accumulates the total and the value count. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            const double addend = nextField.to!double;
            _total += addend;
            _count++;
        }

        /* Guard against dividing by a zero count: report NaN instead. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double mean = double.nan;
            if (_count > 0) mean = _total / _count.to!double;
            return printOptions.formatNumber(mean);
        }
    }
}

unittest // MeanOperator
{
    auto col1File = [["10"], ["9.5"], ["7.5"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(col1File, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(col2File, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(col2File, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(col3File, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(col3File, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(col3File, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    auto col1misFile = [[""], ["6"], [""], ["14"], ["40"]];
    testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         new MissingFieldPolicy(false, "0"));  // Replace missing
}

/* MedianOperator produces the median of all the values. This is a numeric operator.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
*/
class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* The median is read from the saved values for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex));
        }
    }
}

unittest // MedianOperator
{
    auto col1File = [["10"], ["9.5"], ["7.5"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(col1File, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(col2File, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(col2File, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(col3File, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(col3File, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(col3File, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           new MissingFieldPolicy(false, "0"));  // Replace missing
}

/* QuantileOperator produces the value representing the data at a cumulative probability.
 * This is a numeric operation.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). Data is sorted in ascending order. This operator takes one percentile, but it
 * is common to generate multiple quantile ranks for the same field when summarizing.
 *
 * All the field's values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /* The probability must be in the range [0.0, 1.0]. The header suffix is "pct"
     * followed by the probability expressed as a percentage (e.g. "pct50").
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(0.0 <= probability && probability <= 1.0);
        import std.format : format;

        string header = (probability == 0.0) ? "pct0" : format("pct%02g", probability * 100.0);
        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* The quantile is computed from the sorted saved values for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_numerics : quantile;
            return printOptions.formatNumber(
                quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}

unittest // QuantileOperator
{
    auto col1File = [["10"], ["9.5"], ["7.5"]];
    auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultMissing = new MissingFieldPolicy;

    /* Same as the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50);

    /* The extremes (0, 1), are min and max. */
    testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0);

    testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0);

    /* For missing policies, re-use the median tests. */
    auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                                 new MissingFieldPolicy(true, ""), 0.5);  // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
                                                 new MissingFieldPolicy(false, "0"), 0.5);  // Replace missing
}

/* MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value, without a normalization applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* Computes the median of the saved values, then the median of each value's
         * absolute deviation from that median.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_numerics : rangeMedian;

            auto median = valuesLists.numericValuesMedian(fieldIndex);
            auto values = valuesLists.numericValues(fieldIndex);
            auto medianDevs = new double[values.length];

            /* The index type is left to inference rather than declared as int. A
             * declared int index is a narrowing conversion from the array's length
             * type: deprecated by the compiler and wrong past int.max elements.
             */
            foreach (i, v; values)
                medianDevs[i] = abs(v - median);

            return printOptions.formatNumber(medianDevs.rangeMedian);
        }
    }
}

unittest // MadOperator
{
    auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]);

    auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]];
    testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
                                        new MissingFieldPolicy(false, "0"));  // Replace missing
}

/* VarianceOperator produces the variance of the values, dividing the sum of squared
 * differences from the mean by (count - 1). This is a numeric operator. The result is
 * NaN when fewer than two values have been processed.
 *
 * The calculation is done incrementally; only the count, the running mean, and the
 * running sum of squared differences are kept.
 */
class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Incremental update. The statement order matters: delta uses the mean before
         * this value is folded in, and the _m2 update uses the mean after.
         */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;
            double fieldValue = nextField.to!double;
            double delta = fieldValue - _mean;
            _mean += delta / _count;
            _m2 += delta * (fieldValue - _mean);
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(
                (_count >= 2.0) ? (_m2 / (_count - 1.0)) : double.nan);
        }
    }
}

unittest // VarianceOperator
{
    auto col1File = [["5"], ["10"], ["15"]];
    auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]);

    auto col1misFile = [["5"], ["10"], [""]];
    testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"],
                                             new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"],
                                             new MissingFieldPolicy(false, "15"));  // Replace missing
}

/* StDevOperator produces the standard deviation of the values: the square root of the
 * sum of squared differences from the mean divided by (count - 1). This is a numeric
 * operator. The result is NaN when fewer than two values have been processed.
 *
 * The calculation is done incrementally, using the same running update as
 * VarianceCalculator.
 */
class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;     // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Incremental update. The statement order matters: delta uses the mean before
         * this value is folded in, and the _m2 update uses the mean after.
         */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;
            double fieldValue = nextField.to!double;
            double delta = fieldValue - _mean;
            _mean += delta / _count;
            _m2 += delta * (fieldValue - _mean);
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;
            return printOptions.formatNumber(
                (_count >= 2.0) ? (_m2 / (_count - 1.0)).sqrt : double.nan);
        }
    }
}

/* StDevOperator unit tests - These would be improved with a tolerance option.
*/
unittest
{
    auto col1File = [["1"], ["4"], ["7"]];
    auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    /* Default missing-field policy. */
    testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    auto col1misFile = [["1"], ["4"], [""]];

    /* Exclude missing: the empty field is skipped, so the last result repeats. */
    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
                                          excludeMissing);

    /* Replace missing: the empty field is processed as "7". */
    auto replaceMissing = new MissingFieldPolicy(false, "7");
    testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
                                          replaceMissing);
}

/* UniqueCountOperator generates the number of unique values. Unique values are
 * based on exact text match calculation, not a numeric comparison.
 *
 * All the unique field values are stored in memory as part of this calculation.
*/
class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /* Creates the calculator holding the set of values seen so far. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    class UniqueCountCalculator : SingleFieldCalculator
    {
        private bool[string] _values;    // Used as a set; only the keys matter.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* A string copy of the field is made only the first time a value is seen. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _values) return;
            _values[nextField.to!string] = true;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            const uniqueCount = _values.length;
            return printOptions.formatNumber(uniqueCount);
        }
    }
}

unittest // UniqueCount
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(col1File, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(col2File, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col2File, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 2, "unique_count", ["0", "1", "2", "3"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
                                                new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
                                                new MissingFieldPolicy(false, "XYZ"));  // Replace missing
}

/* MissingCountOperator counts the fields that the global missing-field policy
 * classifies as missing. The operator passes its own non-excluding, non-replacing
 * policy to the base class and keeps the global policy only to classify fields.
 */
class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /* Creates the calculator holding the running count of missing fields. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            const bool fieldIsMissing = this.outer._globalMissingPolicy.isMissingField(nextField);
            if (fieldIsMissing) _missingCount++;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            const size_t total = _missingCount;
            return printOptions.formatNumber(total);
        }
    }
}

unittest // MissingCount
{
    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under either policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}

/* NotMissingCountOperator counts the fields that the global missing-field policy does
 * not classify as missing. The operator passes its own non-excluding, non-replacing
 * policy to the base class and keeps the global policy only to classify fields.
 */
class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /* Creates the calculator holding the running count of non-missing fields. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            const bool fieldIsMissing = this.outer._globalMissingPolicy.isMissingField(nextField);
            if (!fieldIsMissing) _notMissingCount++;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            const size_t total = _notMissingCount;
            return printOptions.formatNumber(total);
        }
    }
}

unittest // NotMissingCount
{
    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under either policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], policy);
        testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], policy);
        testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], policy);
    }
}

/* ModeOperator reports the value that occurs most often. When several values share
 * the highest count, the one that was seen first is reported. The result is an empty
 * string when no values have been processed.
 *
 * Each distinct value and its occurrence count are held in memory.
 */
class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    /* Creates the calculator holding the per-value counts. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;          // Occurrences per distinct value.
        private Appender!(string[]) _uniqueValues;    // Distinct values, in first-seen order.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Known values get their count bumped. New values are copied to a string,
         * appended to the first-seen list, and start with a count of one.
         */
        final override void processNextField(const char[] nextField)
        {
            if (auto existingCount = nextField in _valueCounts)
            {
                (*existingCount)++;
            }
            else
            {
                string value = nextField.to!string;
                _uniqueValues.put(value);
                _valueCounts[value] = 1;
            }
        }

        /* Walks the values in first-seen order. A value replaces the current best only
         * when its count is strictly higher, so ties go to the earlier value.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string bestValue = "";
            size_t bestCount = 0;

            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                const size_t candidateCount = _valueCounts[candidate];
                if (candidateCount <= bestCount) continue;

                bestValue = candidate;
                bestCount = candidateCount;
            }

            return bestValue;
        }
    }
}

unittest // ModeOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing


    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
                                         new MissingFieldPolicy(false, "X"));  // Replace missing
}

/* ModeCountOperator outputs the count of the most frequent value seen.
 *
 * All the field values are stored in memory as part of this calculation.
3726 * 3727 */ 3728 class ModeCountOperator : SingleFieldOperator 3729 { 3730 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3731 { 3732 super("mode_count", fieldIndex, missingPolicy); 3733 } 3734 3735 final override SingleFieldCalculator makeCalculator() 3736 { 3737 return new ModeCountCalculator(fieldIndex); 3738 } 3739 3740 class ModeCountCalculator : SingleFieldCalculator 3741 { 3742 private size_t[string] _valueCounts; 3743 3744 this(size_t fieldIndex) 3745 { 3746 super(fieldIndex); 3747 } 3748 3749 final override ModeCountOperator getOperator() 3750 { 3751 return this.outer; 3752 } 3753 3754 final override void processNextField(const char[] nextField) 3755 { 3756 auto countPtr = (nextField in _valueCounts); 3757 3758 if (countPtr is null) 3759 { 3760 string value = nextField.to!string; 3761 _valueCounts[value] = 1; 3762 } 3763 else 3764 { 3765 (*countPtr)++; 3766 } 3767 } 3768 3769 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3770 { 3771 size_t modeCount = 0; 3772 foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 3773 return printOptions.formatNumber(modeCount); 3774 } 3775 } 3776 } 3777 3778 unittest // ModeCountOperator 3779 { 3780 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3781 auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 3782 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3783 3784 testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 3785 testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 3786 testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 3787 testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 3788 testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 3789 
testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); 3790 3791 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3792 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], 3793 new MissingFieldPolicy(true, "")); // Exclude missing 3794 3795 3796 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], 3797 new MissingFieldPolicy(false, "X")); // Replace missing 3798 } 3799 3800 /* ValuesOperator outputs each value delimited by an alternate delimiter character. 3801 * 3802 * All the field values are stored in memory as part of this calculation. This is 3803 * handled by unique key value lists. 3804 */ 3805 3806 class ValuesOperator : SingleFieldOperator 3807 { 3808 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3809 { 3810 super("values", fieldIndex, missingPolicy); 3811 setSaveFieldValuesText(); 3812 } 3813 3814 final override SingleFieldCalculator makeCalculator() 3815 { 3816 return new ValuesCalculator(fieldIndex); 3817 } 3818 3819 class ValuesCalculator : SingleFieldCalculator 3820 { 3821 this(size_t fieldIndex) 3822 { 3823 super(fieldIndex); 3824 } 3825 3826 final override ValuesOperator getOperator() 3827 { 3828 return this.outer; 3829 } 3830 3831 /* Work is done by saving the field values. 
*/ 3832 final override void processNextField(const char[] nextField) 3833 { } 3834 3835 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3836 { 3837 return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); 3838 } 3839 } 3840 } 3841 3842 unittest // ValuesOperator 3843 { 3844 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3845 auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 3846 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; 3847 3848 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 3849 testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 3850 testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 3851 testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 3852 testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 3853 testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); 3854 3855 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], 3856 new MissingFieldPolicy(true, "")); // Exclude missing 3857 3858 3859 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], 3860 new MissingFieldPolicy(false, "X")); // Replace missing 3861 } 3862 3863 /* UniqueValuesOperator outputs each unique value delimited by an alternate delimiter 3864 * character. Values are output in the order seen. 3865 * 3866 * All unique field values are stored in memory as part of this calculation. 
3867 * 3868 */ 3869 class UniqueValuesOperator : SingleFieldOperator 3870 { 3871 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3872 { 3873 super("unique_values", fieldIndex, missingPolicy); 3874 } 3875 3876 final override SingleFieldCalculator makeCalculator() 3877 { 3878 return new UniqueValuesCalculator(fieldIndex); 3879 } 3880 3881 class UniqueValuesCalculator : SingleFieldCalculator 3882 { 3883 private size_t[string] _valuesHash; 3884 private Appender!(string[]) _uniqueValues; 3885 3886 this(size_t fieldIndex) 3887 { 3888 super(fieldIndex); 3889 } 3890 3891 final override UniqueValuesOperator getOperator() 3892 { 3893 return this.outer; 3894 } 3895 3896 final override void processNextField(const char[] nextField) 3897 { 3898 auto ptr = (nextField in _valuesHash); 3899 3900 if (ptr is null) 3901 { 3902 string value = nextField.to!string; 3903 _uniqueValues.put(value); 3904 _valuesHash[value] = 1; 3905 } 3906 } 3907 3908 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3909 { 3910 return _uniqueValues.data.join(printOptions.valuesDelimiter); 3911 } 3912 } 3913 } 3914 3915 unittest // UniqueValuesOperator 3916 { 3917 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3918 auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 3919 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; 3920 3921 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 3922 testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 3923 testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 3924 testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 3925 testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]); 
3926 testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); 3927 3928 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], 3929 new MissingFieldPolicy(true, "")); // Exclude missing 3930 3931 3932 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], 3933 new MissingFieldPolicy(false, "X")); // Replace missing 3934 }