/**
Command line tool that reads TSV files and summarizes field values associated with
equivalent keys.

Copyright (c) 2016-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_summarize;

import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
import std.array : join;
import std.conv : to;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple;
import std.container : DList;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then runs tsvSummarize on the remaining arguments
     * (the input files). Returns 0 on success, 1 if summarization throws, or the exit
     * code chosen by processArgs when it asks for execution to stop.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try tsvSummarize(cmdopt, cmdArgs[1..$]);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  make    color   time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make is generated by the command:

  $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv

This produces:

  make   time_min  time_mean
  ford   122       127
  chevy  124       124
  bmw    118       122

Using '--group-by 1,2' will group by both 'make' and 'color'. Omitting the
'--group-by' entirely summarizes fields for full file.

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv

Most operators take custom headers in a similarly way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11

The quantile operator requires one or more probabilities after the fields:

  --quantile 2:0.25              // Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   // Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Use --help-verbose for more extensive help.

Options:
EOS";

/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions
{
    string programName;

    /* Options set directly on the command line. */
    size_t[] keyFields;                  // -g, --group-by
    bool hasHeader = false;              // --header
    bool writeHeader = false;            // -w, --write-header
    char inputFieldDelimiter = '\t';     // --d|delimiter
    char valuesDelimiter = '|';          // --v|values-delimiter
    size_t floatPrecision = 12;          // --p|float-precision
    bool excludeMissing = false;         // --x|exclude-missing
    string missingValueReplacement;      // --r|replace-missing
    bool helpVerbose = false;            // --help-verbose
    bool versionWanted = false;          // --V|version
    DList!Operator operators;            // Operators, in the order specified.
    size_t endFieldIndex = 0;            // Derived value. Max field index used plus one.

    /* Derived value.
     * NOTE(review): a class instance created in a struct field initializer is, as far
     * as the D language rules go, built once and shared by every TsvSummarizeOptions
     * value. derivations() re-applies the policy settings on each processArgs call -
     * confirm that sharing one policy object is acceptable for all users of this struct.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     */
    auto processArgs (ref string[] cmdArgs)
    {
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by", "<field-list> Fields to use as key.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header", " Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count", " Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &countOptionHandler,
                "count-header", "STR Count occurrences of each unique key, like '--count', but use STR as the header.", &countHeaderOptionHandler,
                "retain", "<field-list> Retain one copy of the field.", &operatorOptionHandler!RetainOperator,
                "first", "<field-list>[:STR] First value seen.", &operatorOptionHandler!FirstOperator,
                "last", "<field-list>[:STR] Last value seen.", &operatorOptionHandler!LastOperator,
                "min", "<field-list>[:STR] Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator,
                "max", "<field-list>[:STR] Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator,
                "range", "<field-list>[:STR] Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator,
                "sum", "<field-list>[:STR] Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator,
                "mean", "<field-list>[:STR] Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator,
                "median", "<field-list>[:STR] Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator,
                "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler,
                "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator,
                "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator,
                "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator,
                "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator,
                "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator,
                "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator,
                "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator,
                "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator,
                "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator,
                "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            consistencyValidations();
            derivations();
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* operationOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * The option value is '<field-list>' or '<field>:<header>'. One operator is appended
     * to 'operators' per field, and endFieldIndex is raised to cover each field used.
     * Throws an Exception on a malformed value, on a custom header combined with
     * multiple fields, or when the operator does not allow a custom header.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        import std.typecons : Yes;
        import tsv_utils.common.utils : parseFieldList;

        auto valSplit = findSplit(optionVal, ":");

        /* Reject an empty field list and a trailing ':' with no header after it. */
        if (valSplit[0].empty || (!valSplit[1].empty && valSplit[2].empty))
        {
            throw new Exception(
                format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                       option, optionVal, option, option));
        }

        try foreach (fieldNum, fieldIndex;
                     valSplit[0].to!string
                     .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1))
        {
            auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

            if (!valSplit[2].empty)  // Header specified
            {
                if (fieldNum > 1)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.",
                               option, optionVal));
                }
                else if (!op.allowCustomHeader)
                {
                    throw new Exception(
                        format("Invalid option: '--%s %s'. Operator does not support custom headers.",
                               option, optionVal));
                }

                op.setCustomHeader(valSplit[2].to!string);
            }

            operators.insertBack(op);
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then re-throw the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>]' or '<field>:<prob>:<header>'.
     * One QuantileOperator is appended per (field, probability) pair. Throws an
     * Exception on malformed values, probabilities outside [0.0,1.0], or a custom
     * header combined with multiple fields or probabilities.
     */
    private void quantileOperatorOptionHandler(string option, string optionVal)
    {
        import std.typecons : Yes;
        import tsv_utils.common.utils : parseFieldList;

        auto formatErrorMsg(string option, string optionVal)
        {
            return format(
                "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, option, option);
        }

        /* First split separates the field list from the rest. */
        auto split1 = findSplit(optionVal, ":");

        if (split1[0].empty || (!split1[1].empty && split1[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        /* Second split separates the probabilities from the optional header. */
        auto split2 = findSplit(split1[2], ":");

        if (split2[0].empty || (!split2[1].empty && split2[2].empty))
            throw new Exception(formatErrorMsg(option, optionVal));

        auto fieldStr = split1[0];
        auto probStr = split2[0];
        auto header = split2[2];

        size_t[] fieldIndices;
        double[] probs;

        try foreach (fieldIndex;
                     fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex))
        {
            fieldIndices ~= fieldIndex;
        }
        catch (Exception exc)
        {
            /* Prefix the message with the option name, then re-throw the same exception. */
            exc.msg = format("[--%s] %s", option, exc.msg);
            throw exc;
        }

        foreach (str; probStr.splitter(','))
        {
            double p;

            try p = str.to!double;
            catch (Exception exc)
                throw new Exception(formatErrorMsg(option, optionVal));

            /* Written as a negated conjunction so that NaN is rejected as well. */
            if (!(p >= 0.0 && p <= 1.0))
                throw new Exception(
                    format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].",
                           option, optionVal, p));

            probs ~= p;
        }

        if (!header.empty && (fieldIndices.length > 1 || probs.length > 1))
        {
            throw new Exception(
                format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.",
                       option, optionVal));
        }

        assert (fieldIndices.length > 0);
        assert (probs.length > 0);
        assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

        foreach (fieldIndex; fieldIndices)
        {
            foreach (p; probs)
            {
                auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                if (!header.empty) op.setCustomHeader(header);
                operators.insertBack(op);
            }
            if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
        }
    }

    /* Handler for '--count'. Appends a CountOperator using its default header. */
    private void countOptionHandler()
    {
        operators.insertBack(new CountOperator());
    }

    /* Handler for '--count-header STR'. Appends a CountOperator using STR as the header. */
    private void countHeaderOptionHandler(string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }

    /* This routine does validations not handled by processArgs. Throws an Exception
     * describing the first inconsistency found.
     */
    private void consistencyValidations()
    {
        if (operators.empty)
        {
            throw new Exception("At least one summary operator is required.");
        }

        if (inputFieldDelimiter == valuesDelimiter)
        {
            throw new Exception("Cannot use the same character for both --d|delimiter and --v|values-delimiter.");
        }

        if (excludeMissing && missingValueReplacement.length != 0)
        {
            throw new Exception("Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");
        }
    }

    /* Post-processing derivations. */
    void derivations()
    {
        /* keyFields need to be part of the endFieldIndex, which is one past the last field index. */
        keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } );

        /* Missing field policy. */
        globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);
    }
}

/** tsvSummarize does the primary work of the tsv-summarize program.
437 */ 438 void tsvSummarize(TsvSummarizeOptions cmdopt, const string[] inputFiles) 439 { 440 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 441 442 /* Pick the Summarizer based on the number of key-fields entered. */ 443 auto summarizer = 444 (cmdopt.keyFields.length == 0) 445 ? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))( 446 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 447 448 : (cmdopt.keyFields.length == 1) 449 ? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))( 450 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 451 452 : new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))( 453 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); 454 455 /* Add the operators to the Summarizer. */ 456 summarizer.setOperators(inputRangeObject(cmdopt.operators[])); 457 458 /* Process each input file, one line at a time. */ 459 auto lineFields = new char[][](cmdopt.endFieldIndex); 460 bool headerFound = false; 461 foreach (filename; (inputFiles.length > 0) ? inputFiles : ["-"]) 462 { 463 auto inputStream = (filename == "-") ? stdin : filename.File(); 464 foreach (lineNum, line; inputStream.bufferedByLine.enumerate(1)) 465 { 466 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, lineNum); 467 468 /* Copy the needed number of fields to the fields array. 469 * Note: The number is zero if no operator needs fields. Notably, the count 470 * operator. Used by itself, it counts the number input lines (ala 'wc -l'). 471 */ 472 if (cmdopt.endFieldIndex > 0) 473 { 474 size_t fieldIndex = 0; 475 foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter)) 476 { 477 if (fieldIndex == cmdopt.endFieldIndex) break; 478 lineFields[fieldIndex] = fieldValue; 479 fieldIndex++; 480 } 481 482 if (fieldIndex == 0) 483 { 484 assert(cmdopt.endFieldIndex > 0); 485 assert(line.length == 0); 486 487 /* Bug work-around. Empty lines are not handled properly by splitter. 
488 * - Bug: https://issues.dlang.org/show_bug.cgi?id=15735 489 * - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 490 * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the 491 * unique values in field 1. If there's only one column, then an empty 492 * line becomes an empty string for field 1. Work-around: Point to the 493 * line. It's an empty string. 494 */ 495 lineFields[fieldIndex] = line; 496 fieldIndex++; 497 } 498 499 if (fieldIndex < cmdopt.endFieldIndex) 500 { 501 throw new Exception( 502 format("Not enough fields in line. File: %s, Line: %s", 503 (filename == "-") ? "Standard Input" : filename, lineNum)); 504 } 505 } 506 507 if (cmdopt.hasHeader && lineNum == 1) 508 { 509 if (!headerFound) 510 { 511 summarizer.processHeaderLine(lineFields); 512 headerFound = true; 513 } 514 } 515 else 516 { 517 /* Process the line. Processing will fail (throw) if a field cannot be 518 * converted to the expected type. 519 */ 520 try summarizer.processNextLine(lineFields); 521 catch (Exception exc) 522 { 523 throw new Exception( 524 format("Could not process line or field: %s\n File: %s Line: %s%s", 525 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 526 (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); 527 } 528 } 529 } 530 } 531 532 debug writeln("[tsvSummarize] After reading all data."); 533 534 /* Whew! We're done processing input data. Run the calculations and print. */ 535 auto printOptions = SummarizerPrintOptions( 536 cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); 537 auto stdoutWriter = stdout.lockingTextWriter; 538 539 if (cmdopt.hasHeader || cmdopt.writeHeader) 540 { 541 summarizer.writeSummaryHeader(stdoutWriter, printOptions); 542 } 543 544 summarizer.writeSummaryBody(stdoutWriter, printOptions); 545 } 546 547 /** The default field header. This is used when the input doesn't have field headers, 548 * but field headers are used in the output. 
The default is "fieldN", where N is the 549 * 1-upped field number. 550 */ 551 string fieldHeaderFromIndex(size_t fieldIndex) 552 { 553 enum prefix = "field"; 554 return prefix ~ (fieldIndex + 1).to!string; 555 } 556 557 unittest 558 { 559 assert(fieldHeaderFromIndex(0) == "field1"); 560 assert(fieldHeaderFromIndex(10) == "field11"); 561 } 562 563 /** Produce a summary header from a field header. 564 * 565 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is 566 * "length" and the operation is "max", the summary header is "length_max". The field 567 * header typically comes a header line in the input data or was constructed by 568 * fieldHeaderFromIndex(). 569 * 570 * If operationName is the empty string, then fieldHeader is used unchanged. This supports 571 * the Retain operator. 572 */ 573 string summaryHeaderFromFieldHeader(string fieldHeader, string operationName) 574 { 575 return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader; 576 } 577 578 unittest 579 { 580 assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc"); 581 assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield"); 582 } 583 584 /** SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically 585 * specified with command line options, it is separated out for modularity. 586 */ 587 struct SummarizerPrintOptions 588 { 589 char fieldDelimiter; 590 char valuesDelimiter; 591 size_t floatPrecision = 12; 592 593 import std.traits : isFloatingPoint, isIntegral; 594 595 auto formatNumber(T)(T n) const 596 if (isFloatingPoint!T || isIntegral!T) 597 { 598 import tsv_utils.common.numerics : formatNumber; 599 return formatNumber!T(n, floatPrecision); 600 } 601 } 602 603 /** A Summarizer object maintains the state of the summarization and performs basic 604 * processing. Handling of files and input lines is left to the caller. 
605 * 606 * Classes supporting the Summarizer must implement the methods: 607 * - setOperators - Called after initializing the object for each operator to be processed. 608 * - processHeaderLine - Called to process the header line of each file. Returns true if 609 * it was the first header line processed (used when reading multiple files). 610 * - processNextLine - Called to process non-header lines. 611 * - writeSummaryHeader - Called to write the header line. 612 * - writeSummaryBody - Called to write the result lines. 613 * 614 */ 615 interface Summarizer(OutputRange) 616 { 617 /** Called after initializing the object for each operator to be processed. */ 618 void setOperators(InputRange!Operator op); 619 620 /** Called to process the header line of each file. Returns true if it was the 621 * first header line processed (used when reading multiple files). 622 */ 623 bool processHeaderLine(const char[][] lineFields); 624 625 /** Called to process non-header lines. */ 626 void processNextLine(const char[][] lineFields); 627 628 /** Called to write the header line. */ 629 void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 630 631 /** Called to write the result lines. */ 632 void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 633 } 634 635 /** SummarizerBase performs work shared by all sumarizers, most everything except for 636 * handling of unique keys. 637 * 638 * The base class handles creation, allocates storage for Operators and SharedFieldValues, 639 * and similar. Derived classes deal primarily with unique keys and the associated Calculators 640 * and UniqueKeyValuesLists. 641 */ 642 class SummarizerBase(OutputRange) : Summarizer!OutputRange 643 { 644 private char _inputFieldDelimiter; 645 private bool _hasProcessedFirstHeaderLine = false; 646 private SharedFieldValues _sharedFieldValues = null; // Null if no shared field value lists. 
647 protected MissingFieldPolicy _missingPolicy; 648 protected DList!Operator _operators; 649 protected size_t _numOperators = 0; 650 651 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 652 { 653 _inputFieldDelimiter = inputFieldDelimiter; 654 _missingPolicy = missingPolicy; 655 } 656 657 char inputFieldDelimiter() const @property 658 { 659 return _inputFieldDelimiter; 660 } 661 662 /** Sets the Operators used by the Summarizer. Called after construction. */ 663 void setOperators(InputRange!Operator operators) 664 { 665 foreach (op; operators) 666 { 667 _operators.insertBack(op); 668 _numOperators++; 669 auto numericFieldsToSave = op.numericFieldsToSave(); 670 auto textFieldsToSave = op.textFieldsToSave(); 671 672 if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0) 673 { 674 if (_sharedFieldValues is null) 675 { 676 _sharedFieldValues = new SharedFieldValues(); 677 } 678 numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x)); 679 textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x)); 680 } 681 } 682 } 683 684 /** Called to process the header line of each file. Returns true if it was the 685 * first header line processed (used when reading multiple files). 686 */ 687 bool processHeaderLine(const char[][] lineFields) 688 { 689 if (!_hasProcessedFirstHeaderLine) 690 { 691 _operators.each!(x => x.processHeaderLine(lineFields)); 692 _hasProcessedFirstHeaderLine = true; 693 return true; 694 } 695 else 696 { 697 return false; 698 } 699 } 700 701 protected final UniqueKeyValuesLists makeUniqueKeyValuesLists() 702 { 703 return (_sharedFieldValues is null) 704 ? 
null 705 : _sharedFieldValues.makeUniqueKeyValuesLists; 706 } 707 708 abstract void processNextLine(const char[][] lineFields); 709 abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 710 abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 711 } 712 713 /** The NoKeySummarizer is used when summarizing values across the entire input. 714 * 715 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing 716 * through that mechanism. 717 */ 718 final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange 719 { 720 private Calculator[] _calculators; 721 private UniqueKeyValuesLists _valueLists; 722 723 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 724 { 725 super(inputFieldDelimiter, missingPolicy); 726 } 727 728 /** Called after initializing the object for each operator to be processed. */ 729 override void setOperators(InputRange!Operator operators) 730 { 731 super.setOperators(operators); 732 733 /* Only one Calculator per Operation, so create them as Operators are added. */ 734 foreach (op; operators) _calculators ~= op.makeCalculator; 735 _valueLists = super.makeUniqueKeyValuesLists(); 736 } 737 738 /** Called to process non-header lines. */ 739 override void processNextLine(const char[][] lineFields) 740 { 741 _calculators.each!(x => x.processNextLine(lineFields)); 742 if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy); 743 } 744 745 /** Called to write the header line. */ 746 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 747 { 748 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 749 put(outputStream, '\n'); 750 } 751 752 /** Called to write the result lines. 
*/ 753 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 754 { 755 put(outputStream, 756 _calculators[] 757 .map!(x => x.calculate(_valueLists, printOptions)) 758 .join(printOptions.fieldDelimiter)); 759 put(outputStream, '\n'); 760 } 761 } 762 763 /** KeySummarizerBase does work shared by the single key and multi-key summarizers. 764 * 765 * The primary difference between those two is the formation of the key. The primary 766 * reason for separating those into two separate classes is to simplify (speed-up) 767 * handling of single field keys, which are the most common use case. 768 */ 769 class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange 770 { 771 protected struct UniqueKeyData 772 { 773 Calculator[] calculators; 774 UniqueKeyValuesLists valuesLists; 775 } 776 777 private DList!string _uniqueKeys; 778 private UniqueKeyData[string] _uniqueKeyData; 779 780 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 781 { 782 super(inputFieldDelimiter, missingPolicy); 783 } 784 785 protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields) 786 { 787 debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string); 788 789 auto dataPtr = (key in _uniqueKeyData); 790 auto data = (dataPtr is null) ? 
addUniqueKey(key.to!string) : *dataPtr; 791 792 data.calculators.each!(x => x.processNextLine(lineFields)); 793 if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy); 794 } 795 796 protected UniqueKeyData addUniqueKey(string key) 797 { 798 assert(key !in _uniqueKeyData); 799 800 _uniqueKeys.insertBack(key); 801 802 auto calculators = new Calculator[_numOperators]; 803 size_t i = 0; 804 foreach (op; _operators) 805 { 806 calculators[i] = op.makeCalculator; 807 i++; 808 } 809 810 return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists()); 811 } 812 813 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 814 { 815 put(outputStream, keyFieldHeader()); 816 put(outputStream, printOptions.fieldDelimiter); 817 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 818 put(outputStream, '\n'); 819 } 820 821 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 822 { 823 foreach(key; _uniqueKeys) 824 { 825 auto data = _uniqueKeyData[key]; 826 put(outputStream, key); 827 put(outputStream, printOptions.fieldDelimiter); 828 put(outputStream, 829 data.calculators[] 830 .map!(x => x.calculate(data.valuesLists, printOptions)) 831 .join(printOptions.fieldDelimiter)); 832 put(outputStream, '\n'); 833 } 834 } 835 836 abstract string keyFieldHeader() const @property; 837 } 838 839 /** This Summarizer is for the case where the unique key is based on exactly one field. 
840 */ 841 final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 842 { 843 private size_t _keyFieldIndex = 0; 844 private string _keyFieldHeader; 845 private DList!string _uniqueKeys; 846 847 this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 848 { 849 super(inputFieldDelimiter, missingPolicy); 850 _keyFieldIndex = keyFieldIndex; 851 _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex); 852 } 853 854 override string keyFieldHeader() const @property 855 { 856 return _keyFieldHeader; 857 } 858 859 override bool processHeaderLine(const char[][] lineFields) 860 { 861 assert(_keyFieldIndex <= lineFields.length); 862 863 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 864 if (isFirstHeaderLine) 865 { 866 _keyFieldHeader = lineFields[_keyFieldIndex].to!string; 867 } 868 return isFirstHeaderLine; 869 } 870 871 override void processNextLine(const char[][] lineFields) 872 { 873 assert(_keyFieldIndex < lineFields.length); 874 processNextLineWithKey(lineFields[_keyFieldIndex], lineFields); 875 } 876 } 877 878 /** This Summarizer is for the case where the unique key is based on multiple fields. 
*/
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;      // Zero-based key field indices, in key order.
    private string _keyFieldHeader;         // Header printed for the key columns.

    /* Note: The list of unique keys is owned by the base class. No separate list is
     * kept here, a same-named field in this class would only shadow the base class one.
     */

    /** Copies the key field indices and sets a default key header, the per-index
     * default headers joined by the input field delimiter. The default is replaced if
     * a header line is processed.
     */
    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    /** Header text for the key columns. */
    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Forwards the header line to the base class. If the base class reports this as the
     * first header line, the key header is rebuilt from the header text of the key
     * fields. Returns the base class result.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = joinKeyFields(lineFields).to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the delimiter-joined key fields as the unique key.
     *
     * The joined key is passed on without converting it to a string first. The
     * single-key summarizer passes a 'const char[]' key to the same routine, and the
     * visible part of that routine converts the key to a string when it adds a new key.
     */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        processNextLineWithKey(joinKeyFields(lineFields), lineFields);
    }

    /* Joins the key fields of a line, in key order, with the input field delimiter. */
    private auto joinKeyFields(const char[][] lineFields)
    {
        return _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter);
    }
}

version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are the command line args, an input file, and expected output. The
     * input file and expected output are already split into lines and fields, the helper
     * manages re-assembly. The program name from the command line args is printed if
     * an error occurs, it is useful to identify the test that failed.
*
     * Note: Much of this is a duplication of tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the test name (cmdArgs[0]). 'msg' is
         * used as a format string, so text that is not a format string (for example an
         * exception message) must be passed as an argument to "%s".
         */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testSummarizer] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSummarizeOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        auto summarizer =
            (cmdopt.keyFields.length == 0)
            ? new NoKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : (cmdopt.keyFields.length == 1)
            ? new OneKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

            : new MultiKeySummarizer!(typeof(appender!(char[])()))(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            /* Line numbers start at 1, so at most one line is treated as the header. */
            if (cmdopt.hasHeader && lineNum == 1)
            {
                summarizer.processHeaderLine(lineFields);
            }
            else
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    /* The exception text is data, not a format string. Passing it as
                     * an argument keeps a '%' in the text from breaking format().
                     */
                    assert(false, formatAssertMessage("%s", exc.msg));
                }
            }
        }
        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }

        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output, newline terminated when not empty. */
        auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }
}

unittest
{
    /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
     * extent, command line option handling (TsvSummarizeOptions). Individual operators
     * have separate tests, those tests test the no-key summarizer. The Values operator is
     * used in these tests. It engages a number of behaviors, and the results have limited
     * ambiguity. Using only one operator limits dependence on individual operators.
     */

    auto file1 = [["fld1", "fld2", "fld3"],
                  ["a", "a",  "3"],
                  ["c", "a",  "2b"],
                  ["c", "bc", ""],
                  ["a", "c",  "2b"],
                  ["",  "bc", ""],
                  ["c", "bc", "3"]];

    /* Single-key summarizer tests.
1037 */ 1038 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1"], 1039 file1, 1040 [["fld1", "fld1_values"], 1041 ["a", "a|a"], 1042 ["c", "c|c|c"], 1043 ["", ""]] 1044 ); 1045 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2"], 1046 file1, 1047 [["fld1", "fld2_values"], 1048 ["a", "a|c"], 1049 ["c", "a|bc|bc"], 1050 ["", "bc"]] 1051 ); 1052 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3"], 1053 file1, 1054 [["fld1", "fld3_values"], 1055 ["a", "3|2b"], 1056 ["c", "2b||3"], 1057 ["", ""]] 1058 ); 1059 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3"], 1060 file1, 1061 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1062 ["a", "a|a", "a|c", "3|2b"], 1063 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1064 ["", "", "bc", ""]] 1065 ); 1066 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3"], 1067 file1, 1068 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1069 ["a", "a|a", "a|c", "3|2b"], 1070 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1071 ["", "", "bc", ""]] 1072 ); 1073 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1"], 1074 file1, 1075 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1076 ["a", "3|2b", "a|c", "a|a"], 1077 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1078 ["", "", "bc", ""]] 1079 ); 1080 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1"], 1081 file1, 1082 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1083 ["a", "3|2b", "a|c", "a|a"], 1084 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1085 ["", "", "bc", ""]] 1086 ); 1087 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1"], 1088 file1, 1089 [["fld2", "fld1_values"], 1090 ["a", "a|c"], 1091 ["bc", "c||c"], 1092 ["c", "a"]] 1093 ); 1094 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2"], 1095 file1, 1096 [["fld2", "fld2_values"], 1097 ["a", "a|a"], 1098 ["bc", "bc|bc|bc"], 1099 ["c", 
"c"]] 1100 ); 1101 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3"], 1102 file1, 1103 [["fld2", "fld3_values"], 1104 ["a", "3|2b"], 1105 ["bc", "||3"], 1106 ["c", "2b"]] 1107 ); 1108 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3"], 1109 file1, 1110 [["fld2", "fld1_values", "fld3_values"], 1111 ["a", "a|c", "3|2b"], 1112 ["bc", "c||c", "||3"], 1113 ["c", "a", "2b"]] 1114 ); 1115 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1"], 1116 file1, 1117 [["fld2", "fld3_values", "fld1_values"], 1118 ["a", "3|2b", "a|c"], 1119 ["bc", "||3", "c||c"], 1120 ["c", "2b", "a"]] 1121 ); 1122 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1"], 1123 file1, 1124 [["fld3", "fld1_values"], 1125 ["3", "a|c"], 1126 ["2b", "c|a"], 1127 ["", "c|"]] 1128 ); 1129 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2"], 1130 file1, 1131 [["fld3", "fld2_values"], 1132 ["3", "a|bc"], 1133 ["2b", "a|c"], 1134 ["", "bc|bc"]] 1135 ); 1136 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2"], 1137 file1, 1138 [["fld3", "fld1_values", "fld2_values"], 1139 ["3", "a|c", "a|bc"], 1140 ["2b", "c|a", "a|c"], 1141 ["", "c|", "bc|bc"]] 1142 ); 1143 1144 /* Multi-key summarizer tests. 
1145 */ 1146 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1"], 1147 file1, 1148 [["fld1", "fld2", "fld1_values"], 1149 ["a", "a", "a"], 1150 ["c", "a", "c"], 1151 ["c", "bc", "c|c"], 1152 ["a", "c", "a"], 1153 ["", "bc", ""]] 1154 ); 1155 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2"], 1156 file1, 1157 [["fld1", "fld2", "fld2_values"], 1158 ["a", "a", "a"], 1159 ["c", "a", "a"], 1160 ["c", "bc", "bc|bc"], 1161 ["a", "c", "c"], 1162 ["", "bc", "bc"]] 1163 ); 1164 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3"], 1165 file1, 1166 [["fld1", "fld2", "fld3_values"], 1167 ["a", "a", "3"], 1168 ["c", "a", "2b"], 1169 ["c", "bc", "|3"], 1170 ["a", "c", "2b"], 1171 ["", "bc", ""]] 1172 ); 1173 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1"], 1174 file1, 1175 [["fld1", "fld2", "fld3_values", "fld1_values"], 1176 ["a", "a", "3", "a"], 1177 ["c", "a", "2b", "c"], 1178 ["c", "bc", "|3", "c|c"], 1179 ["a", "c", "2b", "a"], 1180 ["", "bc", "", ""]] 1181 ); 1182 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1"], 1183 file1, 1184 [["fld3", "fld2", "fld1_values"], 1185 ["3", "a", "a"], 1186 ["2b", "a", "c"], 1187 ["", "bc", "c|"], 1188 ["2b", "c", "a"], 1189 ["3", "bc", "c"]] 1190 ); 1191 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1"], 1192 file1, 1193 [["fld3", "fld2", "fld1_values"], 1194 ["3", "a", "a"], 1195 ["2b", "a", "c"], 1196 ["", "bc", "c|"], 1197 ["2b", "c", "a"], 1198 ["3", "bc", "c"]] 1199 ); 1200 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2"], 1201 file1, 1202 [["fld2", "fld1", "fld3", "fld2_values"], 1203 ["a", "a", "3", "a"], 1204 ["a", "c", "2b", "a"], 1205 ["bc", "c", "", "bc"], 1206 ["c", "a", "2b", "c"], 1207 ["bc", "", "", "bc"], 1208 ["bc", "c", "3", "bc"]] 1209 ); 1210 1211 /* Missing policies. 
*/ 1212 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing"], 1213 file1, 1214 [["fld1", "fld1_values"], 1215 ["a", "a|a"], 1216 ["c", "c|c|c"], 1217 ["", ""]] 1218 ); 1219 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x"], 1220 file1, 1221 [["fld1", "fld2_values"], 1222 ["a", "a|c"], 1223 ["c", "a|bc|bc"], 1224 ["", "bc"]] 1225 ); 1226 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x"], 1227 file1, 1228 [["fld1", "fld3_values"], 1229 ["a", "3|2b"], 1230 ["c", "2b|3"], 1231 ["", ""]] 1232 ); 1233 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x"], 1234 file1, 1235 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1236 ["a", "a|a", "a|c", "3|2b"], 1237 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1238 ["", "", "bc", ""]] 1239 ); 1240 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA"], 1241 file1, 1242 [["fld1", "fld1_values"], 1243 ["a", "a|a"], 1244 ["c", "c|c|c"], 1245 ["", "NA"]] 1246 ); 1247 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA"], 1248 file1, 1249 [["fld1", "fld2_values"], 1250 ["a", "a|c"], 1251 ["c", "a|bc|bc"], 1252 ["", "bc"]] 1253 ); 1254 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA"], 1255 file1, 1256 [["fld1", "fld3_values"], 1257 ["a", "3|2b"], 1258 ["c", "2b|NA|3"], 1259 ["", "NA"]] 1260 ); 1261 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA"], 1262 file1, 1263 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1264 ["a", "a|a", "a|c", "3|2b"], 1265 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1266 ["", "NA", "bc", "NA"]] 1267 ); 1268 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x"], 1269 file1, 1270 [["fld1", "fld2", "fld3_values", "fld1_values"], 1271 ["a", "a", "3", "a"], 1272 ["c", "a", "2b", "c"], 
1273 ["c", "bc", "3", "c|c"], 1274 ["a", "c", "2b", "a"], 1275 ["", "bc", "", ""]] 1276 ); 1277 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x"], 1278 file1, 1279 [["fld3", "fld2", "fld1_values"], 1280 ["3", "a", "a"], 1281 ["2b", "a", "c"], 1282 ["", "bc", "c"], 1283 ["2b", "c", "a"], 1284 ["3", "bc", "c"]] 1285 ); 1286 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x"], 1287 file1, 1288 [["fld2", "fld1", "fld3", "fld2_values"], 1289 ["a", "a", "3", "a"], 1290 ["a", "c", "2b", "a"], 1291 ["bc", "c", "", "bc"], 1292 ["c", "a", "2b", "c"], 1293 ["bc", "", "", "bc"], 1294 ["bc", "c", "3", "bc"]] 1295 ); 1296 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA"], 1297 file1, 1298 [["fld1", "fld2", "fld3_values", "fld1_values"], 1299 ["a", "a", "3", "a"], 1300 ["c", "a", "2b", "c"], 1301 ["c", "bc", "NA|3", "c|c"], 1302 ["a", "c", "2b", "a"], 1303 ["", "bc", "NA", "NA"]] 1304 ); 1305 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA"], 1306 file1, 1307 [["fld3", "fld2", "fld1_values"], 1308 ["3", "a", "a"], 1309 ["2b", "a", "c"], 1310 ["", "bc", "c|NA"], 1311 ["2b", "c", "a"], 1312 ["3", "bc", "c"]] 1313 ); 1314 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA"], 1315 file1, 1316 [["fld2", "fld1", "fld3", "fld2_values"], 1317 ["a", "a", "3", "a"], 1318 ["a", "c", "2b", "a"], 1319 ["bc", "c", "", "bc"], 1320 ["c", "a", "2b", "c"], 1321 ["bc", "", "", "bc"], 1322 ["bc", "c", "3", "bc"]] 1323 ); 1324 1325 /* Validate that the no-key summarizer works with testSummarizer helper function. 1326 */ 1327 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2"], 1328 file1, 1329 [["fld1_values", "fld2_values"], 1330 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1331 ); 1332 1333 /* Header variations: no header line; auto-generated header line; custom headers. 
1334 */ 1335 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1"], 1336 file1[1..$], 1337 [["a", "a|a"], 1338 ["c", "c|c|c"], 1339 ["", ""]] 1340 ); 1341 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2"], 1342 file1[1..$], 1343 [["a", "a", "a"], 1344 ["c", "a", "a"], 1345 ["c", "bc", "bc|bc"], 1346 ["a", "c", "c"], 1347 ["", "bc", "bc"]] 1348 ); 1349 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1"], 1350 file1[1..$], 1351 [["field2", "field1_values"], 1352 ["a", "a|c"], 1353 ["bc", "c||c"], 1354 ["c", "a"]] 1355 ); 1356 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1"], 1357 file1[1..$], 1358 [["field3", "field2", "field1_values"], 1359 ["3", "a", "a"], 1360 ["2b", "a", "c"], 1361 ["", "bc", "c|"], 1362 ["2b", "c", "a"], 1363 ["3", "bc", "c"]] 1364 ); 1365 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values"], 1366 file1, 1367 [["fld2", "Field3Values"], 1368 ["a", "3|2b"], 1369 ["bc", "||3"], 1370 ["c", "2b"]] 1371 ); 1372 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues"], 1373 file1, 1374 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1375 ["a", "a", "3", "a"], 1376 ["c", "a", "2b", "c"], 1377 ["c", "bc", "|3", "c|c"], 1378 ["a", "c", "2b", "a"], 1379 ["", "bc", "", ""]] 1380 ); 1381 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals"], 1382 file1[1..$], 1383 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1384 ["a", "3|2b", "a|c", "a|a"], 1385 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1386 ["", "", "bc", ""]] 1387 ); 1388 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1389 file1[1..$], 1390 [["field1", "field3", "field2", "field3_values", "ValsField1", 
"ValsField2"], 1391 ["a", "3", "a", "3", "a", "a"], 1392 ["c", "2b", "a", "2b", "c", "a"], 1393 ["c", "", "bc", "", "c", "bc"], 1394 ["a", "2b", "c", "2b", "a", "c"], 1395 ["", "", "bc", "", "", "bc"], 1396 ["c", "3", "bc", "3", "c", "bc"]] 1397 ); 1398 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2"], 1399 file1[1..$], 1400 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1401 ["a", "3", "a", "3", "a", "a"], 1402 ["c", "2b", "a", "2b", "c", "a"], 1403 ["c", "", "bc", "", "c", "bc"], 1404 ["a", "2b", "c", "2b", "a", "c"], 1405 ["", "", "bc", "", "", "bc"], 1406 ["c", "3", "bc", "3", "c", "bc"]] 1407 ); 1408 1409 /* Alternate file widths and lengths. 1410 */ 1411 1412 auto file3x2 = [["fld1", "fld2", "fld3"], 1413 ["a", "b", "c"], 1414 ["c", "b", "a"]]; 1415 1416 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3"], 1417 file3x2, 1418 [["fld1", "fld3_values"], 1419 ["a", "c"], 1420 ["c", "a"]] 1421 ); 1422 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3"], 1423 file3x2, 1424 [["fld2", "fld3_values"], 1425 ["b", "c|a"]] 1426 ); 1427 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3"], 1428 file3x2, 1429 [["fld2", "fld1", "fld3_values"], 1430 ["b", "a", "c"], 1431 ["b", "c", "a"]] 1432 ); 1433 1434 auto file3x1 = [["fld1", "fld2", "fld3"], 1435 ["a", "b", "c"]]; 1436 1437 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3"], 1438 file3x1, 1439 [["fld1", "fld3_values"], 1440 ["a", "c"]] 1441 ); 1442 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3"], 1443 file3x1[1..$], 1444 [["a", "c"]] 1445 ); 1446 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3"], 1447 file3x1, 1448 [["fld2", "fld1", "fld3_values"], 1449 ["b", "a", "c"]] 1450 ); 1451 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", 
"--values", "3"], 1452 file3x1[1..$], 1453 [["b", "a", "c"]] 1454 ); 1455 1456 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1457 1458 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3"], 1459 file3x0, 1460 [["fld1", "fld3_values"]] 1461 ); 1462 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3"], 1463 file3x0[1..$], 1464 [] 1465 ); 1466 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3"], 1467 file3x0[1..$], 1468 [["field1", "field3_values"]] 1469 ); 1470 1471 1472 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3"], 1473 file3x0, 1474 [["fld2", "fld1", "fld3_values"]] 1475 ); 1476 1477 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3"], 1478 file3x0[1..$], 1479 [] 1480 ); 1481 1482 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3"], 1483 file3x0[1..$], 1484 [["field2", "field1", "field3_values"]] 1485 ); 1486 1487 auto file2x1 = [["fld1", "fld2"], 1488 ["a", "b"]]; 1489 1490 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2"], 1491 file2x1, 1492 [["fld1", "fld2_values"], 1493 ["a", "b"]] 1494 ); 1495 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1"], 1496 file2x1, 1497 [["fld2", "fld1", "fld1_values"], 1498 ["b", "a", "a"]] 1499 ); 1500 1501 auto file2x0 = [["fld1", "fld2"]]; 1502 1503 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2"], 1504 file2x0, 1505 [["fld1", "fld2_values"]] 1506 ); 1507 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1"], 1508 file2x0, 1509 [["fld2", "fld1", "fld1_values"]] 1510 ); 1511 1512 auto file1x2 = [["fld1"], 1513 ["a"], 1514 [""]]; 1515 1516 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1"], 1517 file1x2, 1518 [["fld1", "fld1_values"], 1519 ["a", "a"], 1520 ["", ""]] 1521 ); 1522 1523 auto file1x2b = [["fld1"], 1524 [""], 1525 [""]]; 1526 1527 
testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1"], 1528 file1x2b, 1529 [["fld1", "fld1_values"], 1530 ["", "|"]] 1531 ); 1532 1533 auto file1x1 = [["fld1"], 1534 ["x"]]; 1535 1536 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1"], 1537 file1x1, 1538 [["fld1", "fld1_values"], 1539 ["x", "x"]] 1540 ); 1541 1542 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1"], 1543 file1x1[1..$], 1544 [["x", "x"]] 1545 ); 1546 1547 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1"], 1548 file1x1[1..$], 1549 [["field1", "field1_values"], 1550 ["x", "x"]] 1551 ); 1552 1553 auto file1x1b = [["fld1"], 1554 [""]]; 1555 1556 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1"], 1557 file1x1b, 1558 [["fld1", "fld1_values"], 1559 ["", ""]] 1560 ); 1561 1562 auto file1x0 = [["fld1"]]; 1563 1564 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1"], 1565 file1x0, 1566 [["fld1", "fld1_values"]] 1567 ); 1568 1569 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1"], 1570 file1x0[1..$], 1571 [] 1572 ); 1573 1574 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1"], 1575 file1x0[1..$], 1576 [["field1", "field1_values"]] 1577 ); 1578 1579 /* Alternate delimiters. 
*/ 1580 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%"], 1581 file1, 1582 [["fld1_values", "fld2_values"], 1583 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1584 ); 1585 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$"], 1586 file1, 1587 [["fld1_values", "fld2_values"], 1588 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1589 ); 1590 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ","], 1591 file1, 1592 [["fld1_values", "fld2_values"], 1593 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1594 ); 1595 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1596 "--delimiter", "^", "--values-delimiter", ":"], 1597 file1[1..$], 1598 [["field2", "field1_values"], 1599 ["a", "a:c"], 1600 ["bc", "c::c"], 1601 ["c", "a"]] 1602 ); 1603 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1604 "--values-delimiter", "\\"], 1605 file1[1..$], 1606 [["a", "a", "a"], 1607 ["c", "a", "a"], 1608 ["c", "bc", "bc\\bc"], 1609 ["a", "c", "c"], 1610 ["", "bc", "bc"]] 1611 ); 1612 } 1613 1614 /* Summary Operators and Calculators 1615 * 1616 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1617 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1618 * Calculator is used to manage the summary calculation for each unique key in the input. 1619 * 1620 * As an example, consider the command: 1621 * 1622 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1623 * 1624 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1625 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1626 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1627 * calculator needs to track occurrence count and sum. Calculators produce the final 1628 * value when all processing is finished. 
*
 * Summary field headers
 *
 * There are several options for specifying summary field headers. The defaults combine the
 * operator name and the header of the field summarized. The defaults can be overridden
 * on the command line. These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
 */

/** An Operator represents a summary calculation specified on the command line.
 *  e.g. '--mean 5'.
 */
interface Operator
{
    @property string header();
    @property string name();
    void processHeaderLine(const char[][] fields);
    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved
    Calculator makeCalculator();
}

/** Calculators are responsible for the calculation of a single computation. They
 *  process each line and produce the final value when all processing is finished.
 */
interface Calculator
{
    void processNextLine(const char[][] fields);
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/** This class describes processing behavior when a missing value is encountered.
 *
 * A field is missing when it is empty. Exactly one of three behaviors is active: use
 * the missing value unchanged, exclude it, or replace it. A non-empty replacement
 * string selects 'replace', taking precedence over the exclude flag.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // True if missing values are processed unchanged.
    private bool _replaceMissing = false;     // True if missing values are replaced.
    private string _missingReplacement;       // Replacement string if replaceMissing is true.

    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /** Resets all three settings from the exclude flag and the replacement string. */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        const bool haveReplacement = (missingReplacement.length != 0);

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /** True if the field is empty. */
    final bool isMissingField(const char[] field) const
    {
        return field.length == 0;
    }

    /** True if missing values are processed unchanged. */
    final bool useMissing() const @property
    {
        return _useMissing;
    }

    /** True if missing values are neither used unchanged nor replaced. */
    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    /** True if missing values are replaced by missingReplacement. */
    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    /** The replacement string passed to the last updatePolicy call. */
    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
 *    of the fields advertised by Operators as needing sharing.
This list gets created
 *    during command initialization (SummarizerBase.setOperators).
 *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
 *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
 *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
 *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextLine method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
 *    at least partially. Rather than generate sorted copies, the current algorithms
 *    sort the data in place.
 *
 * One concession to duplicate storage is that text and numeric versions of the same
 * field might be stored. The reason is because it's important to convert text to numbers
 * as they are read so that useful error messages can be generated. And, storing both
 * forms of the same field should be less common.
 *
 * The current implementation uses the same missing values policy for all fields. If
 * multiple policies become supported this will need to change.
 *
 * Built-in calculations - UniqueKeyValuesLists have a built-in median operation. This is
 * to avoid repeated calculations of the median by different calculations.
 */

final class SharedFieldValues
{
    // Arrays with field indices that need to be saved.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to add a shared field value for a specific field index.
     * eg. '--median 7' will end up calling addNumericIndex(6), 6 being the zero-based index.
     * A specific index is only added once.
     */
    final void addNumericIndex (size_t index)
    {
        if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index;
    }

    /* Similar to addNumericIndex, except adds a text index. */
    final void addTextIndex (size_t index)
    {
        if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program if no keys
     * are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
    }
}

final class UniqueKeyValuesLists
{
    /* A FieldValues object holds a list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;   // NOTE(review): not referenced within this class.

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes an input line to every saved-field list, numeric lists first. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    /* Linear search for the numeric list saved for a field index. The index must have
     * been passed to the constructor.
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Linear search for the text list saved for a field index. The index must have
     * been passed to the constructor.
     */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* The accessors below return the stored array itself, not a copy. The 'Sorted'
     * variants sort the stored array in place.
     */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    private final class FieldValues(ValueType)
    {
        import std.array : appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;     // True if _medianValue is current.
        private bool _isSorted = false;       // True if _values is known to be sorted.
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this list's field from an input line, applying the missing policy:
         * a missing field is stored as-is, skipped, or stored as the replacement.
         * Conversion to ValueType happens before the value is stored.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                appendValue(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                appendValue(missingPolicy.missingReplacement.to!ValueType);
            }
        }

        /* Stores a value and invalidates the cached median and sort state. */
        private void appendValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Sorts the stored values in place, if not known to be sorted, and returns them. */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* Returns the median, computing it on the first call after a value is added. */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _haveMedian = true;

                /* rangeMedian operates on the stored array. The median algorithm may
                 * reorder values in place, so an earlier full sort can no longer be
                 * relied on. Clearing the flag makes getSortedArray sort again.
                 */
                _isSorted = false;
            }

            return _medianValue;
        }
    }
}

/** SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
1941 */ 1942 class SingleFieldOperator : Operator 1943 { 1944 import std.typecons : Flag; 1945 1946 private string _name; 1947 private string _header; 1948 private size_t _fieldIndex; 1949 private bool _useHeaderSuffix; 1950 private bool _allowCustomHeader; 1951 private bool _hasCustomHeader = false; 1952 private size_t[] _numericFieldsToSave; 1953 private size_t[] _textFieldsToSave; 1954 private MissingFieldPolicy _missingPolicy; 1955 1956 this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, 1957 Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, 1958 Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) 1959 { 1960 _name = operatorName; 1961 _fieldIndex = fieldIndex; 1962 _missingPolicy = missingPolicy; 1963 _useHeaderSuffix = useHeaderSuffix; 1964 _allowCustomHeader = allowCustomHeader; 1965 // Default header. May be overrridden by custom header or header line. 1966 _header = 1967 fieldHeaderFromIndex(fieldIndex) 1968 .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); 1969 } 1970 1971 void setCustomHeader (string customHeader) 1972 { 1973 assert(_allowCustomHeader); 1974 _header = customHeader; 1975 _hasCustomHeader = true; 1976 } 1977 1978 final string name() const @property 1979 { 1980 return _name; 1981 } 1982 1983 final bool allowCustomHeader() const @property 1984 { 1985 return _allowCustomHeader; 1986 } 1987 1988 /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field 1989 * that the field values should be saved. These should called during construction. 
1990 */ 1991 final void setSaveFieldValuesNumeric() 1992 { 1993 _numericFieldsToSave ~= _fieldIndex; 1994 } 1995 1996 final void setSaveFieldValuesText() 1997 { 1998 _textFieldsToSave ~= _fieldIndex; 1999 } 2000 2001 final MissingFieldPolicy missingPolicy() @property 2002 { 2003 return _missingPolicy; 2004 } 2005 2006 final size_t fieldIndex() const @property 2007 { 2008 return _fieldIndex; 2009 } 2010 2011 final string header() const @property 2012 { 2013 return _header; 2014 } 2015 2016 final bool useHeaderSuffix() const @property 2017 { 2018 return _useHeaderSuffix; 2019 } 2020 2021 void processHeaderLine(const char[][] fields) 2022 { 2023 if (!_hasCustomHeader) { 2024 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2025 _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 2026 _useHeaderSuffix ? _name : ""); 2027 } 2028 } 2029 2030 final size_t[] numericFieldsToSave() 2031 { 2032 return _numericFieldsToSave; 2033 } 2034 2035 final size_t[] textFieldsToSave() 2036 { 2037 return _textFieldsToSave; 2038 } 2039 2040 abstract SingleFieldCalculator makeCalculator(); 2041 } 2042 2043 /** SingleFieldCalculator is a base class for the common case of calculators using a single 2044 * field. Derived classes implement processNextField() rather than processNextLine(). 
2045 */ 2046 class SingleFieldCalculator : Calculator 2047 { 2048 private size_t _fieldIndex; 2049 2050 this(size_t fieldIndex) 2051 { 2052 _fieldIndex = fieldIndex; 2053 } 2054 2055 final size_t fieldIndex() const @property 2056 { 2057 return _fieldIndex; 2058 } 2059 2060 final void processNextLine(const char[][] fields) 2061 { 2062 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2063 2064 auto missingPolicy = getOperator.missingPolicy; 2065 const char[] field = fields[_fieldIndex]; 2066 2067 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 2068 { 2069 processNextField(field); 2070 } 2071 else if (missingPolicy.replaceMissing) 2072 { 2073 processNextField(missingPolicy.missingReplacement); 2074 } 2075 } 2076 2077 abstract SingleFieldOperator getOperator(); 2078 2079 abstract void processNextField(const char[] field); 2080 } 2081 2082 /* Unittest helper functions. Only compiled when -unittest is in effect. */ 2083 version(unittest) 2084 { 2085 /** A helper for SingleFieldOperator unit tests. 2086 * 2087 * testSingleFieldOperator takes a set of split file values, a field index, a header 2088 * suffix, and a set of expected values. The expected values array contains the 2089 * initial value (zero entries) and the expected values after each line. (One more 2090 * expected value than input lines.) The zero entry case is what is generated for an 2091 * empty file. An example testing the 'min' operator against a file with 2 columns, 2092 * 3 rows, using field index 1: 2093 * 2094 * testSingleFieldOperator!MinOperator( 2095 * [["10", "100"], // The split file. 3 lines by 2 rows. 2096 * ["5", "50"], 2097 * ["20", "200"]], 2098 * 1, // Field index (zero-based, so "100", "50", "200") 2099 * "min", // The header suffix, normally the operator name. 2100 * ["nan", "100", "50", "50"]); // Min value after processing each line. 2101 * 2102 * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. 
2103 * Then run the operator is tested against each column, a total of six calls. Headers 2104 * are automatically checked. Additional entries can be used to extend coverage. 2105 * 2106 * A non-default MissingFieldPolicy can be provide as an optional last argument. 2107 * Operator tests should include exclusion and replacement variations. See operator 2108 * unit tests for details. 2109 * 2110 * The testSingleFieldOperatorBase adds an additional capability - Custom operator 2111 * init arguments. Currently this is used only by the quantile operator. 2112 * 2113 * These tests do not check unique key behavior (group-by). Operators don't have info 2114 * about unique keys, and interact with them only indirectly, via Calculators. 2115 */ 2116 void testSingleFieldOperator(OperatorClass : SingleFieldOperator) 2117 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2118 const char[][] expectedValues, 2119 MissingFieldPolicy missingPolicy = new MissingFieldPolicy) 2120 { 2121 testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy); 2122 } 2123 2124 void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...) 2125 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2126 const char[][] expectedValues, 2127 MissingFieldPolicy missingPolicy, 2128 T extraOpInitArgs) 2129 { 2130 import std.format : format; 2131 import std.array : appender; 2132 import std.string : chomp; 2133 import std.traits : EnumMembers; 2134 2135 auto numFields = (splitFile[0]).length; 2136 2137 assert(fieldIndex < numFields, 2138 format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s", 2139 headerSuffix)); 2140 assert(splitFile.length + 1 == expectedValues.length, 2141 format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2142 headerSuffix)); 2143 2144 /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. 
*/ 2145 auto printOptions = SummarizerPrintOptions('#', '|'); 2146 2147 /* An input header line. */ 2148 string[] inputHeaderLine = new string[numFields]; 2149 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2150 2151 /* The different expected output field headers. */ 2152 auto outputFieldHeaderWithNoHeaderLine = 2153 fieldHeaderFromIndex(fieldIndex) 2154 .summaryHeaderFromFieldHeader(headerSuffix); 2155 auto outputFieldHeaderFromHeaderLine = 2156 inputHeaderLine[fieldIndex] 2157 .summaryHeaderFromFieldHeader(headerSuffix); 2158 auto customOutputFieldHeader = "custom"; 2159 2160 enum HeaderUsecase { 2161 HeaderLine_DefaultHeader, 2162 HeaderLine_CustomHeader, 2163 NoHeaderLine_DefaultHeader, 2164 NoHeaderLine_CustomHeader, 2165 NoHeaderLine_NoOutputHeader, 2166 } 2167 2168 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2169 { 2170 return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2171 op.name, hc, actual, expected); 2172 } 2173 2174 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex, 2175 const char[] actual, const char[] expected) 2176 { 2177 return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'", 2178 op.name, hc, rowIndex, fieldIndex, actual, expected); 2179 } 2180 2181 /* Run the logic for each header use case. 
*/ 2182 foreach (hc; EnumMembers!HeaderUsecase) 2183 { 2184 bool hasInputHeader = ( 2185 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2186 hc == HeaderUsecase.HeaderLine_CustomHeader 2187 ); 2188 bool hasOutputHeader = ( 2189 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2190 hc == HeaderUsecase.HeaderLine_CustomHeader || 2191 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2192 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2193 ); 2194 bool hasCustomHeader = ( 2195 hc == HeaderUsecase.HeaderLine_CustomHeader || 2196 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2197 ); 2198 2199 if (hasCustomHeader) assert(hasOutputHeader); 2200 2201 auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs); 2202 2203 if (hasCustomHeader) 2204 { 2205 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2206 op.setCustomHeader(customOutputFieldHeader); 2207 } 2208 2209 Operator[] operatorArray; 2210 operatorArray ~= op; 2211 2212 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2213 summarizer.setOperators(inputRangeObject(operatorArray)); 2214 2215 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2216 2217 if (hasOutputHeader) 2218 { 2219 /* Write the header line. Note that this is a one-field header, */ 2220 auto headerLineOutput = appender!(char[])(); 2221 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2222 2223 /* Test that the header was generated correctly. 2224 * 2225 * Note: Because the output is generated by a Summarizer, it will have a 2226 * trailing newline. Use chomp to trim it. 
2227 */ 2228 final switch (hc) 2229 { 2230 case HeaderUsecase.HeaderLine_DefaultHeader: 2231 assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine, 2232 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2233 outputFieldHeaderFromHeaderLine)); 2234 break; 2235 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2236 assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine, 2237 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2238 outputFieldHeaderWithNoHeaderLine)); 2239 break; 2240 case HeaderUsecase.HeaderLine_CustomHeader: 2241 case HeaderUsecase.NoHeaderLine_CustomHeader: 2242 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2243 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2244 customOutputFieldHeader)); 2245 break; 2246 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2247 break; 2248 } 2249 2250 } 2251 2252 /* For each line, process the line, generate the output, and test that the 2253 * value is correct. Start with the empty file case. 2254 */ 2255 foreach (i, const char[] expected; expectedValues) 2256 { 2257 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2258 auto summaryLineOutput = appender!(char[])(); 2259 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2260 assert(summaryLineOutput.data.chomp == expected, 2261 valueAssertMessage(operatorArray[0], hc, i, fieldIndex, 2262 summaryLineOutput.data.chomp, expectedValues[i])); 2263 } 2264 } 2265 } 2266 } 2267 2268 /** ZeroFieldOperator is a base class for operators that take no input. The main use 2269 * case is the CountOperator, which counts the occurrences of each unique key. Other 2270 * uses are possible, for example, weighted random number assignment. 2271 * 2272 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify 2273 * the information available to such a routine. 
In particular, the split fields passed 2274 * to processHeaderLine and processNextLine don't include all fields in the input, 2275 * something that might not be obvious when implementing an operator. (Only fields 2276 * required by operators acting on specific fields are included.) 2277 */ 2278 class ZeroFieldOperator : Operator 2279 { 2280 import std.typecons : Flag; 2281 2282 private string _name; 2283 private string _header; 2284 2285 this(string operatorName) 2286 { 2287 _name = operatorName; 2288 _header = operatorName; 2289 } 2290 2291 void setCustomHeader (string customHeader) 2292 { 2293 _header = customHeader; 2294 } 2295 2296 bool allowCustomHeader() const @property 2297 { 2298 return true; 2299 } 2300 2301 final string name() const @property 2302 { 2303 return _name; 2304 } 2305 2306 final string header() const @property 2307 { 2308 return _header; 2309 } 2310 2311 /* A no-op. ZeroFieldOperators have no access to the header line. */ 2312 final void processHeaderLine(const char[][] fields) { } 2313 2314 /* A no-op. ZeroFieldOperators have no access to fields. */ 2315 final size_t[] numericFieldsToSave() 2316 { 2317 size_t[] emptyArray; 2318 return emptyArray; 2319 } 2320 2321 /* A no-op. ZeroFieldOperators have no access to fields. */ 2322 final size_t[] textFieldsToSave() 2323 { 2324 size_t[] emptyArray; 2325 return emptyArray; 2326 } 2327 2328 abstract ZeroFieldCalculator makeCalculator(); 2329 } 2330 2331 /** ZeroFieldCalculator is a base class for operators that don't use fields as input. 2332 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class. 2333 * 2334 * Derived classes implement processNextEntry() rather than processNextLine(), and the 2335 * single argument form of calculate() given as an abstract function. 
2336 */ 2337 class ZeroFieldCalculator : Calculator 2338 { 2339 this() { } 2340 2341 final void processNextLine(const char[][] fields) 2342 { 2343 debug writefln("[%s]", __FUNCTION__,); 2344 processNextEntry(); 2345 } 2346 2347 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2348 { 2349 return calculate(printOptions); 2350 } 2351 2352 abstract void processNextEntry(); 2353 abstract string calculate(const ref SummarizerPrintOptions printOptions); 2354 } 2355 2356 version(unittest) 2357 { 2358 /* A helper for ZeroFieldOperator unit tests. 2359 * 2360 * testZeroFieldOperator takes a set of split file values, a default header, and a 2361 * set of expected values. The expected values array contains the expected values 2362 * after each line. 2363 * 2364 * testZeroFieldOperator is very similar to testSingleFieldOperator, except that 2365 * there is no use of field indices and fewer types of headers. See the latter's 2366 * documentation and the CountOperator unit tests for examples. 2367 */ 2368 void testZeroFieldOperator(OperatorClass : ZeroFieldOperator) 2369 (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues) 2370 { 2371 import std.format : format; 2372 import std.array : appender; 2373 import std.string : chomp; 2374 import std.traits : EnumMembers; 2375 2376 auto numFields = (splitFile[0]).length; 2377 2378 assert(splitFile.length + 1 == expectedValues.length, 2379 format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2380 defaultHeader)); 2381 2382 /* printOptions - Not used these tests, but needed for API calls. */ 2383 auto printOptions = SummarizerPrintOptions('#', '|'); 2384 2385 /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */ 2386 auto missingPolicy = new MissingFieldPolicy; 2387 2388 /* An input header line. 
*/ 2389 string[] inputHeaderLine = new string[numFields]; 2390 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2391 2392 auto customOutputFieldHeader = "custom"; 2393 2394 enum HeaderUsecase { 2395 HeaderLine_DefaultHeader, 2396 HeaderLine_CustomHeader, 2397 NoHeaderLine_DefaultHeader, 2398 NoHeaderLine_CustomHeader, 2399 NoHeaderLine_NoOutputHeader, 2400 } 2401 2402 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2403 { 2404 return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2405 op.name, hc, actual, expected); 2406 } 2407 2408 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, 2409 const char[] actual, const char[] expected) 2410 { 2411 return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", 2412 op.name, hc, rowIndex, actual, expected); 2413 } 2414 2415 /* Run the logic for each header use case. 
*/ 2416 foreach (hc; EnumMembers!HeaderUsecase) 2417 { 2418 bool hasInputHeader = ( 2419 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2420 hc == HeaderUsecase.HeaderLine_CustomHeader 2421 ); 2422 bool hasOutputHeader = ( 2423 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2424 hc == HeaderUsecase.HeaderLine_CustomHeader || 2425 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2426 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2427 ); 2428 bool hasCustomHeader = ( 2429 hc == HeaderUsecase.HeaderLine_CustomHeader || 2430 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2431 ); 2432 2433 if (hasCustomHeader) assert(hasOutputHeader); 2434 2435 auto op = new OperatorClass(); 2436 2437 if (hasCustomHeader) 2438 { 2439 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2440 op.setCustomHeader(customOutputFieldHeader); 2441 } 2442 2443 Operator[] operatorArray; 2444 operatorArray ~= op; 2445 2446 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2447 summarizer.setOperators(inputRangeObject(operatorArray)); 2448 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2449 2450 if (hasOutputHeader) 2451 { 2452 /* Write the header line. Note that this is a one-field header, */ 2453 auto headerLineOutput = appender!(char[])(); 2454 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2455 2456 /* Test that the header was generated correctly. 2457 * 2458 * Note: Because the output is generated by a Summarizer, it will have a 2459 * trailing newline. Use chomp to trim it. 
2460 */ 2461 final switch (hc) 2462 { 2463 case HeaderUsecase.HeaderLine_DefaultHeader: 2464 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2465 assert(headerLineOutput.data.chomp == defaultHeader, 2466 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2467 defaultHeader)); 2468 break; 2469 case HeaderUsecase.HeaderLine_CustomHeader: 2470 case HeaderUsecase.NoHeaderLine_CustomHeader: 2471 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2472 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2473 customOutputFieldHeader)); 2474 break; 2475 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2476 break; 2477 } 2478 2479 } 2480 2481 /* For each line, process the line, generate the output, and test that the 2482 * value is correct. Start with the empty file case. 2483 */ 2484 foreach (i, const char[] expected; expectedValues) 2485 { 2486 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2487 auto summaryLineOutput = appender!(char[])(); 2488 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2489 assert(summaryLineOutput.data.chomp == expected, 2490 valueAssertMessage(operatorArray[0], hc, i, 2491 summaryLineOutput.data.chomp, expectedValues[i])); 2492 } 2493 } 2494 } 2495 } 2496 2497 /* Specific operators. 2498 * 2499 * Notes: 2500 * - The 'Calculator' inner classes are 'static'. This means inner class instances do not 2501 * keep a reference to the context of the outer class. In exchange, Calculator instances 2502 * need to hold all needed state, typically the field index they are summarizing. 2503 */ 2504 2505 /** CountOperator counts the number of occurrences of each unique key, or the number of 2506 * input lines if there is no unique key. 2507 * 2508 * CountOperator differs from most other operators in that it doesn't summarize a specific 2509 * field on the line. Instead it is summarizing a property of the unique key itself. For 2510 * this reason it doesn't derive from SingleFieldOperator. 
2511 */ 2512 final class CountOperator : ZeroFieldOperator 2513 { 2514 this() 2515 { 2516 super("count"); 2517 } 2518 2519 final override ZeroFieldCalculator makeCalculator() 2520 { 2521 return new CountCalculator(); 2522 } 2523 2524 static final class CountCalculator : ZeroFieldCalculator 2525 { 2526 private size_t _count = 0; 2527 2528 final override void processNextEntry() 2529 { 2530 _count++; 2531 } 2532 2533 final override string calculate(const ref SummarizerPrintOptions printOptions) 2534 { 2535 return printOptions.formatNumber(_count); 2536 } 2537 } 2538 } 2539 2540 unittest // CountOperator 2541 { 2542 auto col1File = [["10"], ["9.5"], ["11"]]; 2543 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2544 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2545 2546 testZeroFieldOperator!CountOperator(col1File, "count", ["0", "1", "2", "3"]); 2547 testZeroFieldOperator!CountOperator(col2File, "count", ["0", "1", "2", "3"]); 2548 testZeroFieldOperator!CountOperator(col3File, "count", ["0", "1", "2", "3"]); 2549 } 2550 2551 /** RetainOperator retains the first occurrence of a field, without changing the header. 2552 * 2553 * RetainOperator is intended for fields where the value is expected to be the same for 2554 * all occurrences of the unique key, and the goal is to pass the value through unchanged. 2555 * It is like FirstOperator, except that the original header is preserved. The original 2556 * header preservation is setup in the call to the SingleFieldOperation constructor. 2557 * 2558 * Notes: 2559 * - An option to signal an error if multiple values are encountered might be useful. 
2560 */ 2561 final class RetainOperator : SingleFieldOperator 2562 { 2563 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2564 { 2565 super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader); 2566 } 2567 2568 final override SingleFieldCalculator makeCalculator() 2569 { 2570 return new RetainCalculator(fieldIndex); 2571 } 2572 2573 final class RetainCalculator : SingleFieldCalculator 2574 { 2575 private bool _done = false; 2576 private string _value = ""; 2577 2578 this(size_t fieldIndex) 2579 { 2580 super(fieldIndex); 2581 } 2582 2583 final override RetainOperator getOperator() 2584 { 2585 return this.outer; 2586 } 2587 2588 final override void processNextField(const char[] nextField) 2589 { 2590 if (!_done) 2591 { 2592 _value = nextField.to!string; 2593 _done = true; 2594 } 2595 } 2596 2597 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2598 { 2599 return _value; 2600 } 2601 } 2602 } 2603 2604 unittest // RetainOperator 2605 { 2606 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2607 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2608 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2609 2610 testSingleFieldOperator!RetainOperator(col1File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2611 testSingleFieldOperator!RetainOperator(col2File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2612 testSingleFieldOperator!RetainOperator(col2File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2613 testSingleFieldOperator!RetainOperator(col3File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2614 testSingleFieldOperator!RetainOperator(col3File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2615 testSingleFieldOperator!RetainOperator(col3File, 2, "", ["", "r1c3", "r1c3", "r1c3"]); 2616 2617 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2618 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"], 2619 new 
MissingFieldPolicy(true, "")); // Exclude missing 2620 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"], 2621 new MissingFieldPolicy(false, "NA")); // Replace missing 2622 } 2623 2624 /** FirstOperator outputs the first value found for the field. 2625 */ 2626 final class FirstOperator : SingleFieldOperator 2627 { 2628 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2629 { 2630 super("first", fieldIndex, missingPolicy); 2631 } 2632 2633 final override SingleFieldCalculator makeCalculator() 2634 { 2635 return new FirstCalculator(fieldIndex); 2636 } 2637 2638 final class FirstCalculator : SingleFieldCalculator 2639 { 2640 private bool _done = false; 2641 private string _value = ""; 2642 2643 this(size_t fieldIndex) 2644 { 2645 super(fieldIndex); 2646 } 2647 2648 final override FirstOperator getOperator() 2649 { 2650 return this.outer; 2651 } 2652 2653 final override void processNextField(const char[] nextField) 2654 { 2655 if (!_done) 2656 { 2657 _value = nextField.to!string; 2658 _done = true; 2659 } 2660 } 2661 2662 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2663 { 2664 return _value; 2665 } 2666 } 2667 } 2668 2669 unittest // FirstOperator 2670 { 2671 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2672 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2673 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2674 2675 testSingleFieldOperator!FirstOperator(col1File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2676 testSingleFieldOperator!FirstOperator(col2File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2677 testSingleFieldOperator!FirstOperator(col2File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2678 testSingleFieldOperator!FirstOperator(col3File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2679 testSingleFieldOperator!FirstOperator(col3File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2680 
testSingleFieldOperator!FirstOperator(col3File, 2, "first", ["", "r1c3", "r1c3", "r1c3"]); 2681 2682 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2683 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"], 2684 new MissingFieldPolicy(true, "")); // Exclude missing 2685 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"], 2686 new MissingFieldPolicy(false, "NA")); // Replace missing 2687 } 2688 2689 /** LastOperator outputs the last value found for the field. 2690 */ 2691 final class LastOperator : SingleFieldOperator 2692 { 2693 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2694 { 2695 super("last", fieldIndex, missingPolicy); 2696 } 2697 2698 final override SingleFieldCalculator makeCalculator() 2699 { 2700 return new LastCalculator(fieldIndex); 2701 } 2702 2703 final class LastCalculator : SingleFieldCalculator 2704 { 2705 private string _value = ""; 2706 2707 this(size_t fieldIndex) 2708 { 2709 super(fieldIndex); 2710 } 2711 2712 final override LastOperator getOperator() 2713 { 2714 return this.outer; 2715 } 2716 2717 final override void processNextField(const char[] nextField) 2718 { 2719 _value = nextField.to!string; 2720 } 2721 2722 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2723 { 2724 return _value; 2725 } 2726 } 2727 } 2728 2729 unittest // LastOperator 2730 { 2731 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2732 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2733 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2734 2735 testSingleFieldOperator!LastOperator(col1File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2736 testSingleFieldOperator!LastOperator(col2File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2737 testSingleFieldOperator!LastOperator(col2File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2738 
testSingleFieldOperator!LastOperator(col3File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2739 testSingleFieldOperator!LastOperator(col3File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2740 testSingleFieldOperator!LastOperator(col3File, 2, "last", ["", "r1c3", "r2c3", "r3c3"]); 2741 2742 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2743 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"], 2744 new MissingFieldPolicy(true, "")); // Exclude missing 2745 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"], 2746 new MissingFieldPolicy(false, "NA")); // Replace missing 2747 } 2748 2749 /** MinOperator output the minimum value for the field. This is a numeric operator. 2750 * 2751 * This operator returns the original string without additional numeric formatting. 2752 * This can be useful when joining back to the original data. This is different than 2753 * numeric operators that perform calculations. 2754 */ 2755 final class MinOperator : SingleFieldOperator 2756 { 2757 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2758 { 2759 super("min", fieldIndex, missingPolicy); 2760 } 2761 2762 final override SingleFieldCalculator makeCalculator() 2763 { 2764 return new MinCalculator(fieldIndex); 2765 } 2766 2767 final class MinCalculator : SingleFieldCalculator 2768 { 2769 private bool _isFirst = true; 2770 private double _value = double.nan; 2771 private string _originalString = "nan"; // Note: Cannot format floats at compile time (version 2.087) 2772 2773 this(size_t fieldIndex) 2774 { 2775 super(fieldIndex); 2776 } 2777 2778 final override MinOperator getOperator() 2779 { 2780 return this.outer; 2781 } 2782 2783 final override void processNextField(const char[] nextField) 2784 { 2785 double fieldValue = nextField.to!double; 2786 if (_isFirst) 2787 { 2788 _value = fieldValue; 2789 _originalString = nextField.to!string; 2790 _isFirst = false; 2791 } 2792 else if (fieldValue < _value) 2793 { 2794 
_value = fieldValue; 2795 _originalString = nextField.to!string; 2796 } 2797 } 2798 2799 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2800 { 2801 return _originalString; 2802 } 2803 } 2804 } 2805 2806 unittest // MinOperator 2807 { 2808 auto col1File = [["10"], ["9.5"], ["11"]]; 2809 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2810 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2811 2812 testSingleFieldOperator!MinOperator(col1File, 0, "min", ["nan", "10", "9.5", "9.5"]); 2813 testSingleFieldOperator!MinOperator(col2File, 0, "min", ["nan", "20", "20", "20"]); 2814 testSingleFieldOperator!MinOperator(col2File, 1, "min", ["nan", "-30", "-30", "-31"]); 2815 testSingleFieldOperator!MinOperator(col3File, 0, "min", ["nan", "9009", "199", "199"]); 2816 testSingleFieldOperator!MinOperator(col3File, 1, "min", ["nan", "9", "0", "0"]); 2817 testSingleFieldOperator!MinOperator(col3File, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]); 2818 2819 auto col1misFile = [[""], ["10"], ["-10"]]; 2820 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"], 2821 new MissingFieldPolicy(true, "")); // Exclude missing 2822 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"], 2823 new MissingFieldPolicy(false, "5")); // Replace missing 2824 } 2825 2826 /** MaxOperator output the maximum value for the field. This is a numeric operator. 2827 * 2828 * This operator returns the original string without additional numeric formatting. 2829 * This can be useful when joining back to the original data. This is different than 2830 * numeric operators that perform calculations. 
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /* Tracks the largest parsed value and the input text it came from. */
    final class MaxCalculator : SingleFieldCalculator
    {
        /* True until the first value has been accepted. */
        private bool _isFirst = true;

        /* Largest parsed value so far. */
        private double _value = double.nan;

        /* Input text that produced _value. Returned as-is by calculate().
         * Note: Cannot format floats at compile time (version 2.087)
         */
        private string _originalString = "nan";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double and keeps it when it is the first value or
         * strictly larger than the current maximum. Ties keep the earlier text.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double candidate = nextField.to!double;
            immutable bool isNewMaximum = _isFirst || candidate > _value;

            if (isNewMaximum)
            {
                _value = candidate;
                _originalString = nextField.to!string;
                _isFirst = false;
            }
        }

        /* The arguments are unused; the stored input text is the result. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}
/** RangeOperator outputs the distance between the smallest and largest values seen.
 *
 * With a single value, or when every value is identical, the result is zero. With no
 * values at all both bounds keep their 0.0 initial value, so the result is also zero.
 * This is a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        /* The first value seeds both bounds. After that a value can move at most one
         * of them, so the two comparisons are mutually exclusive.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double parsed = nextField.to!double;

            if (_isFirst)
            {
                _minValue = parsed;
                _maxValue = parsed;
                _isFirst = false;
                return;
            }

            if (parsed > _maxValue)
            {
                _maxValue = parsed;
            }
            else if (parsed < _minValue)
            {
                _minValue = parsed;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}
["11"]]; 2964 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2965 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2966 2967 testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]); 2968 testSingleFieldOperator!RangeOperator(col2File, 0, "range", ["0", "0", "1", "2"]); 2969 testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]); 2970 testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]); 2971 testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]); 2972 testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]); 2973 2974 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 2975 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], 2976 new MissingFieldPolicy(true, "")); // Exclude missing 2977 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], 2978 new MissingFieldPolicy(false, "5.5")); // Replace missing 2979 } 2980 2981 /** SumOperator produces the sum of all the values. This is a numeric operator. 
final class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /* Keeps a running total of the parsed field values. */
    final class SumCalculator : SingleFieldCalculator
    {
        /* Starts at zero, which is also the result when no values are processed. */
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /* Parses the field as a double and adds it to the running total. */
        final override void processNextField(const char[] nextField)
        {
            immutable double addend = nextField.to!double;
            _total += addend;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }
}
/** MeanOperator produces the arithmetic mean (average) of the values. This is a
 * numeric operator. The result is nan when no values have been processed.
 */
final class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;    // Sum of the parsed values.
        private size_t _count = 0;      // Number of values added to _total.

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /* The value is parsed and added before the count is bumped, so a field that
         * fails to parse does not change either member.
         */
        final override void processNextField(const char[] nextField)
        {
            immutable double parsed = nextField.to!double;
            _total += parsed;
            _count++;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            /* Guard the division: no values means there is no mean to report. */
            if (_count == 0)
            {
                return printOptions.formatNumber(double.nan);
            }

            return printOptions.formatNumber(_total / _count.to!double);
        }
    }
}
final class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);

        /* Ask for the field's numeric values to be saved; calculate() reads them back
         * through the unique key values lists.
         */
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* The median is obtained from the saved numeric values for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto medianValue = valuesLists.numericValuesMedian(fieldIndex);
            return printOptions.formatNumber(medianValue);
        }
    }
}
/** QuantileOperator produces the value representing the data at a cumulative
 * probability. This is a numeric operation.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being
 * the median). Data is sorted in ascending order. One operator instance handles one
 * probability, but it is common to generate several quantile ranks for the same field
 * when summarizing.
 *
 * All the field's values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(probability >= 0.0 && probability <= 1.0);

        /* The operator name is "pct" followed by the percentile rank. A probability
         * of exactly zero is spelled "pct0" rather than going through the format
         * string.
         */
        string header = (probability != 0.0) ? format("pct%02g", probability * 100.0) : "pct0";

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Applies the operator's probability to the field's sorted numeric values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            auto sortedValues = valuesLists.numericValuesSorted(fieldIndex);
            return printOptions.formatNumber(quantile(this.outer._prob, sortedValues));
        }
    }
}
*/ 3238 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0); 3239 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0); 3240 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0); 3241 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0); 3242 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0); 3243 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0); 3244 3245 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0); 3246 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0); 3247 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0); 3248 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0); 3249 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0); 3250 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0); 3251 3252 /* For missing policies, re-use the median tests. 
/** MadOperator produces the median absolute deviation from the median. This is a
 * numeric operation.
 *
 * The raw MAD value is reported; no normalization is applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Intentionally empty. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /* Two passes: get the median of the saved values, then take the median of
         * each value's absolute distance from it.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            /* The median is requested before the values, as in the original ordering. */
            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto saved = valuesLists.numericValues(fieldIndex);

            auto deviations = new double[saved.length];
            foreach (i; 0 .. saved.length)
            {
                deviations[i] = abs(saved[i] - center);
            }

            return printOptions.formatNumber(deviations.rangeMedian);
        }
    }
}
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /* Single-pass variance: the count, running mean, and sum of squared differences
     * are updated as each value arrives, so no values are retained.
     */
    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;       // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            immutable double x = nextField.to!double;
            immutable double deltaFromOldMean = x - _mean;

            _mean += deltaFromOldMean / _count;

            /* The second factor uses the mean as just updated on the line above. */
            _m2 += deltaFromOldMean * (x - _mean);
        }

        /* Divides by (count - 1). Fewer than two values yields nan. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_count < 2.0)
            {
                return printOptions.formatNumber(double.nan);
            }

            return printOptions.formatNumber(_m2 / (_count - 1.0));
        }
    }
}
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /* Single-pass standard deviation: the count, running mean, and sum of squared
     * differences are updated as each value arrives, so no values are retained.
     */
    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;       // Sum of squares of differences from current mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;

            immutable double x = nextField.to!double;
            immutable double deltaFromOldMean = x - _mean;

            _mean += deltaFromOldMean / _count;

            /* The second factor uses the mean as just updated on the line above. */
            _m2 += deltaFromOldMean * (x - _mean);
        }

        /* Square root of the (count - 1) variance. Fewer than two values yields nan. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_count < 2.0)
            {
                return printOptions.formatNumber(double.nan);
            }

            return printOptions.formatNumber(sqrt(_m2 / (_count - 1.0)));
        }
    }
}
unittest
{
    /* Each inner array is one input row. The expected-values array passed to each
     * call below has one more entry than the input has rows.
     */
    auto oneColumnRows = [["1"], ["4"], ["7"]];
    auto twoColumnRows = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeColumnRows = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    testSingleFieldOperator!StDevOperator(oneColumnRows, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);

    testSingleFieldOperator!StDevOperator(twoColumnRows, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(twoColumnRows, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);

    testSingleFieldOperator!StDevOperator(threeColumnRows, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(threeColumnRows, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(threeColumnRows, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    /* Same data as the one-column case, but with the last value missing. */
    auto missingLastRows = [["1"], ["4"], [""]];

    auto excludeMissing = new MissingFieldPolicy(true, "");     // Exclude missing
    testSingleFieldOperator!StDevOperator(missingLastRows, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
                                          excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "7");   // Replace missing
    testSingleFieldOperator!StDevOperator(missingLastRows, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
                                          replaceMissing);
}
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Used as a set: the keys are the distinct field texts, the values are unused. */
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field text if it has not been seen before. The text is copied
         * to a string only when a new key is actually inserted.
         */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _values) return;

            _values[nextField.to!string] = true;
        }

        /* The number of distinct keys is the unique count. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}
/** MissingCountOperator generates the number of missing values. This overrides
 * the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate MissingFieldPolicy(false, "").
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        /* NOTE(review): presumably this policy makes the base class pass every field
         * through unchanged, so missing fields reach the calculator - confirm against
         * MissingFieldPolicy.
         */
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field when the operator's saved policy reports it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;

            if (policy.isMissingField(nextField))
            {
                _missingCount++;
            }
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}
3586 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]); 3587 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]); 3588 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]); 3589 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]); 3590 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]); 3591 3592 auto excludeMissing = new MissingFieldPolicy(true, ""); 3593 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3594 3595 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing); 3596 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing); 3597 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing); 3598 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing); 3599 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing); 3600 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing); 3601 3602 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing); 3603 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing); 3604 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing); 3605 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing); 3606 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing); 3607 
/** NotMissingCountOperator generates the number of not-missing values. This overrides
 * the global missingFieldsPolicy.
 *
 * The policy passed to the constructor is kept only to decide whether a field is
 * missing. The base class is given a separate MissingFieldPolicy(false, "").
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;

        /* NOTE(review): presumably this policy makes the base class pass every field
         * through unchanged, so missing fields reach the calculator - confirm against
         * MissingFieldPolicy.
         */
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field unless the operator's saved policy reports it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto policy = this.outer._globalMissingPolicy;

            if (policy.isMissingField(nextField)) return;

            _notMissingCount++;
        }

        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}
testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]); 3664 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]); 3665 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]); 3666 3667 auto excludeMissing = new MissingFieldPolicy(true, ""); 3668 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3669 3670 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing); 3671 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing); 3672 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing); 3673 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing); 3674 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing); 3675 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing); 3676 3677 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing); 3678 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing); 3679 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing); 3680 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing); 3681 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing); 3682 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing); 
3683 } 3684 3685 /** ModeOperator outputs the most frequent value seen. In the event of a tie, the 3686 * first value seen is produced. 3687 * 3688 * All the field values are stored in memory as part of this calculation. 3689 * 3690 */ 3691 final class ModeOperator : SingleFieldOperator 3692 { 3693 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3694 { 3695 super("mode", fieldIndex, missingPolicy); 3696 } 3697 3698 final override SingleFieldCalculator makeCalculator() 3699 { 3700 return new ModeCalculator(fieldIndex); 3701 } 3702 3703 final class ModeCalculator : SingleFieldCalculator 3704 { 3705 private size_t[string] _valueCounts; 3706 private Appender!(string[]) _uniqueValues; 3707 3708 this(size_t fieldIndex) 3709 { 3710 super(fieldIndex); 3711 } 3712 3713 final override ModeOperator getOperator() 3714 { 3715 return this.outer; 3716 } 3717 3718 final override void processNextField(const char[] nextField) 3719 { 3720 auto countPtr = (nextField in _valueCounts); 3721 3722 if (countPtr is null) 3723 { 3724 string value = nextField.to!string; 3725 _uniqueValues.put(value); 3726 _valueCounts[value] = 1; 3727 } 3728 else 3729 { 3730 (*countPtr)++; 3731 } 3732 } 3733 3734 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3735 { 3736 string modeValue = ""; 3737 size_t modeCount = 0; 3738 3739 foreach (value; _uniqueValues.data) 3740 { 3741 assert(value in _valueCounts); 3742 3743 auto count = _valueCounts[value]; 3744 3745 if (count > modeCount) 3746 { 3747 modeValue = value; 3748 modeCount = count; 3749 } 3750 } 3751 3752 return modeValue; 3753 } 3754 } 3755 } 3756 3757 unittest // ModeOperator 3758 { 3759 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3760 auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 3761 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3762 3763 testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", 
"a", "c", "b", "b", "b"]); 3764 testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]); 3765 testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]); 3766 testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]); 3767 testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]); 3768 testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]); 3769 3770 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3771 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], 3772 new MissingFieldPolicy(true, "")); // Exclude missing 3773 3774 3775 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], 3776 new MissingFieldPolicy(false, "X")); // Replace missing 3777 } 3778 3779 /** ModeCountOperator outputs the count of the most frequent value seen. 3780 * 3781 * All the field values are stored in memory as part of this calculation. 
3782 * 3783 */ 3784 final class ModeCountOperator : SingleFieldOperator 3785 { 3786 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3787 { 3788 super("mode_count", fieldIndex, missingPolicy); 3789 } 3790 3791 final override SingleFieldCalculator makeCalculator() 3792 { 3793 return new ModeCountCalculator(fieldIndex); 3794 } 3795 3796 final class ModeCountCalculator : SingleFieldCalculator 3797 { 3798 private size_t[string] _valueCounts; 3799 3800 this(size_t fieldIndex) 3801 { 3802 super(fieldIndex); 3803 } 3804 3805 final override ModeCountOperator getOperator() 3806 { 3807 return this.outer; 3808 } 3809 3810 final override void processNextField(const char[] nextField) 3811 { 3812 auto countPtr = (nextField in _valueCounts); 3813 3814 if (countPtr is null) 3815 { 3816 string value = nextField.to!string; 3817 _valueCounts[value] = 1; 3818 } 3819 else 3820 { 3821 (*countPtr)++; 3822 } 3823 } 3824 3825 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3826 { 3827 size_t modeCount = 0; 3828 foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 3829 return printOptions.formatNumber(modeCount); 3830 } 3831 } 3832 } 3833 3834 unittest // ModeCountOperator 3835 { 3836 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3837 auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 3838 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3839 3840 testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 3841 testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 3842 testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 3843 testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 3844 testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 3845 
testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); 3846 3847 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3848 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], 3849 new MissingFieldPolicy(true, "")); // Exclude missing 3850 3851 3852 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], 3853 new MissingFieldPolicy(false, "X")); // Replace missing 3854 } 3855 3856 /** ValuesOperator outputs each value delimited by an alternate delimiter character. 3857 * 3858 * All the field values are stored in memory as part of this calculation. This is 3859 * handled by unique key value lists. 3860 */ 3861 3862 final class ValuesOperator : SingleFieldOperator 3863 { 3864 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3865 { 3866 super("values", fieldIndex, missingPolicy); 3867 setSaveFieldValuesText(); 3868 } 3869 3870 final override SingleFieldCalculator makeCalculator() 3871 { 3872 return new ValuesCalculator(fieldIndex); 3873 } 3874 3875 final class ValuesCalculator : SingleFieldCalculator 3876 { 3877 this(size_t fieldIndex) 3878 { 3879 super(fieldIndex); 3880 } 3881 3882 final override ValuesOperator getOperator() 3883 { 3884 return this.outer; 3885 } 3886 3887 /* Work is done by saving the field values. 
*/ 3888 final override void processNextField(const char[] nextField) 3889 { } 3890 3891 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3892 { 3893 return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); 3894 } 3895 } 3896 } 3897 3898 unittest // ValuesOperator 3899 { 3900 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3901 auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 3902 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; 3903 3904 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 3905 testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 3906 testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 3907 testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 3908 testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 3909 testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); 3910 3911 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], 3912 new MissingFieldPolicy(true, "")); // Exclude missing 3913 3914 3915 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], 3916 new MissingFieldPolicy(false, "X")); // Replace missing 3917 } 3918 3919 /** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter 3920 * character. Values are output in the order seen. 3921 * 3922 * All unique field values are stored in memory as part of this calculation. 
3923 * 3924 */ 3925 final class UniqueValuesOperator : SingleFieldOperator 3926 { 3927 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3928 { 3929 super("unique_values", fieldIndex, missingPolicy); 3930 } 3931 3932 final override SingleFieldCalculator makeCalculator() 3933 { 3934 return new UniqueValuesCalculator(fieldIndex); 3935 } 3936 3937 final class UniqueValuesCalculator : SingleFieldCalculator 3938 { 3939 private size_t[string] _valuesHash; 3940 private Appender!(string[]) _uniqueValues; 3941 3942 this(size_t fieldIndex) 3943 { 3944 super(fieldIndex); 3945 } 3946 3947 final override UniqueValuesOperator getOperator() 3948 { 3949 return this.outer; 3950 } 3951 3952 final override void processNextField(const char[] nextField) 3953 { 3954 auto ptr = (nextField in _valuesHash); 3955 3956 if (ptr is null) 3957 { 3958 string value = nextField.to!string; 3959 _uniqueValues.put(value); 3960 _valuesHash[value] = 1; 3961 } 3962 } 3963 3964 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3965 { 3966 return _uniqueValues.data.join(printOptions.valuesDelimiter); 3967 } 3968 } 3969 } 3970 3971 unittest // UniqueValuesOperator 3972 { 3973 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3974 auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 3975 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; 3976 3977 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 3978 testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 3979 testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 3980 testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 3981 testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", 
"a|ab|ba"]); 3982 testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); 3983 3984 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], 3985 new MissingFieldPolicy(true, "")); // Exclude missing 3986 3987 3988 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], 3989 new MissingFieldPolicy(false, "X")); // Replace missing 3990 }