1 /** 2 Command line tool that reads TSV files and summarizes field values associated with 3 equivalent keys. 4 5 Copyright (c) 2016-2020, eBay Inc. 6 Initially written by Jon Degenhardt 7 8 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 9 */ 10 module tsv_utils.tsv_summarize; 11 12 import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter; 13 import std.array : join; 14 import std.conv : to; 15 import std.exception : enforce; 16 import std.format : format; 17 import std.range; 18 import std.stdio; 19 import std.typecons : tuple; 20 import std.container : DList; 21 22 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 23 24 version(unittest) 25 { 26 // When running unit tests, use main from -main compiler switch. 27 } 28 else 29 { 30 int main(string[] cmdArgs) 31 { 32 /* When running in DMD code coverage mode, turn on report merging. */ 33 version(D_Coverage) version(DigitalMars) 34 { 35 import core.runtime : dmd_coverSetMerge; 36 dmd_coverSetMerge(true); 37 } 38 39 TsvSummarizeOptions cmdopt; 40 auto r = cmdopt.processArgs(cmdArgs); 41 if (!r[0]) return r[1]; 42 version(LDC_Profile) 43 { 44 import ldc.profile : resetAll; 45 resetAll(); 46 } 47 try tsvSummarize(cmdopt); 48 catch (Exception exc) 49 { 50 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 51 return 1; 52 } 53 return 0; 54 } 55 } 56 57 auto helpTextVerbose = q"EOS 58 Synopsis: tsv-summarize [options] file [file...] 59 60 tsv-summarize reads tabular data files (tab-separated by default), tracks 61 field values for each unique key, and runs summarization algorithms. Consider 62 the file data.tsv: 63 64 make color time 65 ford blue 131 66 chevy green 124 67 ford red 128 68 bmw black 118 69 bmw black 126 70 ford blue 122 71 72 The min and average times for each make is generated by the command: 73 74 $ tsv-summarize --header --group-by 1 --min 3 --mean 3 data.tsv 75 76 This produces: 77 78 make time_min time_mean 79 ford 122 127 80 chevy 124 124 81 bmw 118 122 82 83 Using '--group 1,2' will group by both 'make' and 'color'. Omitting the 84 '--group-by' entirely summarizes fields for full file. 85 86 The program tries to generate useful headers, but custom headers can be 87 specified. Example (using -g and -H shortcuts for --header and --group-by): 88 89 $ tsv-summarize -H -g 1 --min 3:fastest --mean 3:average data.tsv 90 91 Most operators take custom headers in a similarly way, generally following: 92 93 --<operator-name> FIELD[:header] 94 95 Operators can be specified multiple times. They can also take multiple 96 fields (though not when a custom header is specified). Examples: 97 98 --median 2,3,4 99 --median 2-5,7-11 100 101 The quantile operator requires one or more probabilities after the fields: 102 103 --quantile 2:0.25 // Quantile 1 of field 2 104 --quantile 2-4:0.25,0.5,0.75 // Q1, Median, Q3 of fields 2, 3, 4 105 106 Summarization operators available are: 107 count range mad values 108 retain sum var unique-values 109 first mean stddev unique-count 110 last median mode missing-count 111 min quantile mode-count not-missing-count 112 max 113 114 Calculated numeric values are printed to 12 significant digits by default. 115 This can be changed using the '--p|float-precision' option. If six or less 116 it sets the number of significant digits after the decimal point. If 117 greater than six it sets the total number of significant digits. 118 119 Calculations hold onto the minimum data needed while reading data. A few 120 operations like median keep all data values in memory. These operations will 121 start to encounter performance issues as available memory becomes scarce. The 122 size that can be handled effectively is machine dependent, but often quite 123 large files can be handled. 124 125 Operations requiring numeric entries will signal an error and terminate 126 processing if a non-numeric entry is found. 127 128 Missing values are not treated specially by default, this can be changed 129 using the '--x|exclude-missing' or '--r|replace-missing' option. The former 130 turns off processing for missing values, the latter uses a replacement value. 131 132 Options: 133 EOS"; 134 135 auto helpText = q"EOS 136 Synopsis: tsv-summarize [options] file [file...] 137 138 tsv-summarize runs aggregation operations on fields in tab-separated value 139 files. Operations can be run against the full input data or grouped by key 140 fields. Use --help-verbose for more extensive help. 141 142 Options: 143 EOS"; 144 145 /** Command line options - Container and processing. The processArgs method is used to 146 * process the command line. 147 */ 148 struct TsvSummarizeOptions { 149 import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange; 150 151 string programName; // Program name 152 ByLineSourceRange!() inputSources; // Input Files 153 size_t[] keyFields; // -g, --group-by 154 bool hasHeader = false; // --header 155 bool writeHeader = false; // -w, --write-header 156 char inputFieldDelimiter = '\t'; // --d|delimiter 157 char valuesDelimiter = '|'; // --v|values-delimiter 158 size_t floatPrecision = 12; // --p|float-precision 159 bool excludeMissing = false; // --x|exclude-missing 160 string missingValueReplacement; // --r|replace-missing 161 bool helpVerbose = false; // --help-verbose 162 bool versionWanted = false; // --V|version 163 DList!Operator operators; // Operators, in the order specified. 164 size_t endFieldIndex = 0; // Derived value. Max field index used plus one. 165 MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy; // Derived value. 166 167 /* Returns a tuple. First value is true if command line arguments were successfully 168 * processed and execution should continue, or false if an error occurred or the user 169 * asked for help. If false, the second value is the appropriate exit code (0 or 1). 170 * 171 * Returning true (execution continues) means args have been validated and derived 172 * values calculated. In addition, field indices have been converted to zero-based. 173 */ 174 auto processArgs (ref string[] cmdArgs) { 175 import std.algorithm : any, each; 176 import std.getopt; 177 import std.path : baseName, stripExtension; 178 import std.typecons : Yes, No; 179 import tsv_utils.common.getopt_inorder; 180 import tsv_utils.common.utils : makeFieldListOptionHandler; 181 182 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 183 184 try 185 { 186 arraySep = ","; // Use comma to separate values in command line options 187 auto r = getoptInorder( 188 cmdArgs, 189 "help-verbose", " Print full help.", &helpVerbose, 190 191 std.getopt.config.caseSensitive, 192 "V|version", " Print version information and exit.", &versionWanted, 193 std.getopt.config.caseInsensitive, 194 195 "g|group-by", "<field-list> Fields to use as key.", 196 keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex), 197 198 std.getopt.config.caseSensitive, 199 "H|header", " Treat the first line of each file as a header.", &hasHeader, 200 std.getopt.config.caseInsensitive, 201 202 "w|write-header", " Write an output header even if there is no input header.", &writeHeader, 203 "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter, 204 "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter, 205 "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision, 206 "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing, 207 "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement, 208 "count", " Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &countOptionHandler, 209 "count-header", "STR Count occurrences of each unique key, like '--count', but use STR as the header.", &countHeaderOptionHandler, 210 "retain", "<field-list> Retain one copy of the field.", &operatorOptionHandler!RetainOperator, 211 "first", "<field-list>[:STR] First value seen.", &operatorOptionHandler!FirstOperator, 212 "last", "<field-list>[:STR] Last value seen.", &operatorOptionHandler!LastOperator, 213 "min", "<field-list>[:STR] Min value. (Numeric fields only.)", &operatorOptionHandler!MinOperator, 214 "max", "<field-list>[:STR] Max value. (Numeric fields only.)", &operatorOptionHandler!MaxOperator, 215 "range", "<field-list>[:STR] Difference between min and max values. (Numeric fields only.)", &operatorOptionHandler!RangeOperator, 216 "sum", "<field-list>[:STR] Sum of the values. (Numeric fields only.)", &operatorOptionHandler!SumOperator, 217 "mean", "<field-list>[:STR] Mean (average). (Numeric fields only.)", &operatorOptionHandler!MeanOperator, 218 "median", "<field-list>[:STR] Median value. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MedianOperator, 219 "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Numeric fields only. Reads all values into memory.)", &quantileOperatorOptionHandler, 220 "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Numeric fields only. Reads all values into memory.)", &operatorOptionHandler!MadOperator, 221 "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &operatorOptionHandler!VarianceOperator, 222 "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &operatorOptionHandler!StDevOperator, 223 "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeOperator, 224 "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &operatorOptionHandler!ModeCountOperator, 225 "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueCountOperator, 226 "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &operatorOptionHandler!MissingCountOperator, 227 "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &operatorOptionHandler!NotMissingCountOperator, 228 "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &operatorOptionHandler!ValuesOperator, 229 "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &operatorOptionHandler!UniqueValuesOperator, 230 ); 231 232 if (r.helpWanted) 233 { 234 defaultGetoptPrinter(helpText, r.options); 235 return tuple(false, 0); 236 } 237 else if (helpVerbose) 238 { 239 defaultGetoptPrinter(helpTextVerbose, r.options); 240 return tuple(false, 0); 241 } 242 else if (versionWanted) 243 { 244 import tsv_utils.common.tsvutils_version; 245 writeln(tsvutilsVersionNotice("tsv-summarize")); 246 return tuple(false, 0); 247 } 248 249 consistencyValidations(); 250 251 /* Remaining command line args are files. Use standard input if files 252 * were not provided. Truncate cmdArgs to consume the arguments. 253 */ 254 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 255 cmdArgs.length = 1; 256 inputSources = byLineSourceRange(filepaths); 257 258 derivations(); 259 } 260 catch (Exception exc) 261 { 262 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 263 return tuple(false, 1); 264 } 265 return tuple(true, 0); 266 } 267 268 /* operationOptionHandler functions are callbacks that process command line options 269 * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers 270 * check syntactic correctness and instantiate Operator objects that do the work. This 271 * is also where 1-upped field numbers are converted to 0-based indices. 272 */ 273 private void operatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal) 274 { 275 import std.range : enumerate; 276 import std.typecons : Yes, No; 277 import tsv_utils.common.utils : parseFieldList; 278 279 auto valSplit = findSplit(optionVal, ":"); 280 281 enforce(!valSplit[0].empty && (valSplit[1].empty || !valSplit[2].empty), 282 format("Invalid option value: '--%s %s'. Expected: '--%s <field-list>' or '--%s <field>:<header>'.", 283 option, optionVal, option, option)); 284 285 try foreach (fieldNum, fieldIndex; 286 valSplit[0].to!string 287 .parseFieldList!(size_t, Yes.convertToZeroBasedIndex).enumerate(1)) 288 { 289 auto op = new OperatorClass(fieldIndex, globalMissingPolicy); 290 291 if (!valSplit[2].empty) // Header specified 292 { 293 enforce(fieldNum <= 1, 294 format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields.", 295 option, optionVal)); 296 297 enforce(op.allowCustomHeader, 298 format("Invalid option: '--%s %s'. Operator does not support custom headers.", 299 option, optionVal)); 300 301 op.setCustomHeader(valSplit[2].to!string); 302 } 303 304 operators.insertBack(op); 305 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; 306 } 307 catch (Exception exc) 308 { 309 import std.format : format; 310 exc.msg = format("[--%s] %s", option, exc.msg); 311 throw exc; 312 } 313 } 314 315 /* QuantileOperator has a different syntax and needs a custom command option handler. */ 316 private void quantileOperatorOptionHandler(string option, string optionVal) 317 { 318 import std.typecons : Yes, No; 319 import tsv_utils.common.utils : parseFieldList; 320 321 auto formatErrorMsg(string option, string optionVal) 322 { 323 return format( 324 "Invalid option value: '--%s %s'. Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.", 325 option, optionVal, option, option); 326 } 327 328 auto split1 = findSplit(optionVal, ":"); 329 330 enforce(!split1[0].empty && (split1[1].empty || !split1[2].empty), 331 formatErrorMsg(option, optionVal)); 332 333 auto split2 = findSplit(split1[2], ":"); 334 335 enforce(!split2[0].empty && (split2[1].empty || !split2[2].empty), 336 formatErrorMsg(option, optionVal)); 337 338 auto fieldStr = split1[0]; 339 auto probStr = split2[0]; 340 auto header = split2[2]; 341 342 size_t[] fieldIndices; 343 double[] probs; 344 345 try foreach (fieldIndex; 346 fieldStr.to!string.parseFieldList!(size_t, Yes.convertToZeroBasedIndex)) 347 { 348 fieldIndices ~= fieldIndex; 349 } 350 catch (Exception exc) 351 { 352 import std.format : format; 353 exc.msg = format("[--%s] %s", option, exc.msg); 354 throw exc; 355 } 356 357 foreach (str; probStr.splitter(',')) 358 { 359 double p; 360 361 try p = str.to!double; 362 catch (Exception exc) 363 throw new Exception(formatErrorMsg(option, optionVal)); 364 365 enforce(p >= 0.0 && p <= 1.0, 366 format("Invalid option: '--%s %s'. Probability '%g' is not in the interval [0.0,1.0].", 367 option, optionVal, p)); 368 369 probs ~= p; 370 } 371 372 enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1), 373 format("Invalid option: '--%s %s'. Cannot specify a custom header when using multiple fields or multiple probabilities.", 374 option, optionVal)); 375 376 assert (fieldIndices.length > 0); 377 assert (probs.length > 0); 378 assert (header.empty || (fieldIndices.length == 1 && probs.length == 1)); 379 380 foreach (fieldIndex; fieldIndices) 381 { 382 foreach (p; probs) 383 { 384 auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p); 385 if (!header.empty) op.setCustomHeader(header); 386 operators.insertBack(op); 387 } 388 if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1; 389 } 390 } 391 392 private void countOptionHandler() 393 { 394 operators.insertBack(new CountOperator()); 395 } 396 397 private void countHeaderOptionHandler(string option, string optionVal) 398 { 399 auto op = new CountOperator(); 400 op.setCustomHeader(optionVal); 401 operators.insertBack(op); 402 } 403 404 /* This routine does validations not handled by processArgs. */ 405 private void consistencyValidations() 406 { 407 enforce(!operators.empty, "At least one summary operator is required."); 408 409 enforce(inputFieldDelimiter != valuesDelimiter, 410 "Cannot use the same character for both --d|field-delimiter and --v|values-delimiter."); 411 412 enforce(!(excludeMissing && missingValueReplacement.length != 0), 413 "Cannot use both '--x|exclude-missing' and '--r|replace-missing'."); 414 } 415 416 /* Post-processing derivations. */ 417 void derivations() 418 { 419 /* keyFields need to part of the endFieldIndex, which is one past the last field index. */ 420 keyFields.each!(delegate (size_t x) { if (x >= endFieldIndex) endFieldIndex = x + 1; } ); 421 422 /* Missing field policy. */ 423 globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement); 424 } 425 } 426 427 /** tsvSummarize does the primary work of the tsv-summarize program. 428 */ 429 void tsvSummarize(ref TsvSummarizeOptions cmdopt) 430 { 431 import tsv_utils.common.utils : ByLineSourceRange, bufferedByLine, 432 throwIfWindowsNewlineOnUnix; 433 434 /* Check that the input files were setup as expected. Should at least have one 435 * input, stdin if nothing else, and newlines removed from the byLine range. 436 */ 437 assert(!cmdopt.inputSources.empty); 438 static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator))); 439 440 /* Pick the Summarizer based on the number of key-fields entered. */ 441 auto summarizer = 442 (cmdopt.keyFields.length == 0) 443 ? new NoKeySummarizer!(typeof(stdout.lockingTextWriter()))( 444 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 445 446 : (cmdopt.keyFields.length == 1) 447 ? new OneKeySummarizer!(typeof(stdout.lockingTextWriter()))( 448 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 449 450 : new MultiKeySummarizer!(typeof(stdout.lockingTextWriter()))( 451 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); 452 453 /* Add the operators to the Summarizer. */ 454 summarizer.setOperators(inputRangeObject(cmdopt.operators[])); 455 456 /* Process each input file, one line at a time. */ 457 auto lineFields = new char[][](cmdopt.endFieldIndex); 458 bool headerFound = false; 459 foreach (inputStream; cmdopt.inputSources) 460 { 461 foreach (lineNum, line; inputStream.byLine.enumerate(1)) 462 { 463 if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, lineNum); 464 465 /* Copy the needed number of fields to the fields array. 466 * Note: The number is zero if no operator needs fields. Notably, the count 467 * operator. Used by itself, it counts the number input lines (ala 'wc -l'). 468 */ 469 if (cmdopt.endFieldIndex > 0) 470 { 471 size_t fieldIndex = 0; 472 foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter)) 473 { 474 if (fieldIndex == cmdopt.endFieldIndex) break; 475 lineFields[fieldIndex] = fieldValue; 476 fieldIndex++; 477 } 478 479 if (fieldIndex == 0) 480 { 481 assert(cmdopt.endFieldIndex > 0); 482 assert(line.length == 0); 483 484 /* Bug work-around. Empty lines are not handled properly by splitter. 485 * - Bug: https://issues.dlang.org/show_bug.cgi?id=15735 486 * - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030 487 * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the 488 * unique values in field 1. If there's only one column, then an empty 489 * line becomes an empty string for field 1. Work-around: Point to the 490 * line. It's an empty string. 491 */ 492 lineFields[fieldIndex] = line; 493 fieldIndex++; 494 } 495 496 enforce(fieldIndex >= cmdopt.endFieldIndex, 497 format("Not enough fields in line. File: %s, Line: %s", 498 inputStream.name, lineNum)); 499 } 500 501 if (cmdopt.hasHeader && lineNum == 1) 502 { 503 if (!headerFound) 504 { 505 summarizer.processHeaderLine(lineFields); 506 headerFound = true; 507 } 508 } 509 else 510 { 511 /* Process the line. Processing will fail (throw) if a field cannot be 512 * converted to the expected type. 513 */ 514 try summarizer.processNextLine(lineFields); 515 catch (Exception exc) 516 { 517 throw new Exception( 518 format("Could not process line or field: %s\n File: %s Line: %s%s", 519 exc.msg, inputStream.name, lineNum, 520 (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : "")); 521 } 522 } 523 } 524 } 525 526 debug writeln("[tsvSummarize] After reading all data."); 527 528 /* Whew! We're done processing input data. Run the calculations and print. */ 529 auto printOptions = SummarizerPrintOptions( 530 cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); 531 auto stdoutWriter = stdout.lockingTextWriter; 532 533 if (cmdopt.hasHeader || cmdopt.writeHeader) 534 { 535 summarizer.writeSummaryHeader(stdoutWriter, printOptions); 536 } 537 538 summarizer.writeSummaryBody(stdoutWriter, printOptions); 539 } 540 541 /** The default field header. This is used when the input doesn't have field headers, 542 * but field headers are used in the output. The default is "fieldN", where N is the 543 * 1-upped field number. 544 */ 545 string fieldHeaderFromIndex(size_t fieldIndex) 546 { 547 enum prefix = "field"; 548 return prefix ~ (fieldIndex + 1).to!string; 549 } 550 551 unittest 552 { 553 assert(fieldHeaderFromIndex(0) == "field1"); 554 assert(fieldHeaderFromIndex(10) == "field11"); 555 } 556 557 /** Produce a summary header from a field header. 558 * 559 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is 560 * "length" and the operation is "max", the summary header is "length_max". The field 561 * header typically comes a header line in the input data or was constructed by 562 * fieldHeaderFromIndex(). 563 * 564 * If operationName is the empty string, then fieldHeader is used unchanged. This supports 565 * the Retain operator. 566 */ 567 string summaryHeaderFromFieldHeader(string fieldHeader, string operationName) 568 { 569 return (operationName.length > 0) ? fieldHeader ~ "_" ~ operationName : fieldHeader; 570 } 571 572 unittest 573 { 574 assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc"); 575 assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield"); 576 } 577 578 /** SummarizerPrintOptions holds printing options for Summarizers and Calculators. Typically 579 * specified with command line options, it is separated out for modularity. 580 */ 581 struct SummarizerPrintOptions 582 { 583 char fieldDelimiter; 584 char valuesDelimiter; 585 size_t floatPrecision = 12; 586 587 import std.traits : isFloatingPoint, isIntegral; 588 589 auto formatNumber(T)(T n) const 590 if (isFloatingPoint!T || isIntegral!T) 591 { 592 import tsv_utils.common.numerics : formatNumber; 593 return formatNumber!T(n, floatPrecision); 594 } 595 } 596 597 /** A Summarizer object maintains the state of the summarization and performs basic 598 * processing. Handling of files and input lines is left to the caller. 599 * 600 * Classes supporting the Summarizer must implement the methods: 601 * - setOperators - Called after initializing the object for each operator to be processed. 602 * - processHeaderLine - Called to process the header line of each file. Returns true if 603 * it was the first header line processed (used when reading multiple files). 604 * - processNextLine - Called to process non-header lines. 605 * - writeSummaryHeader - Called to write the header line. 606 * - writeSummaryBody - Called to write the result lines. 607 * 608 */ 609 interface Summarizer(OutputRange) 610 { 611 /** Called after initializing the object for each operator to be processed. */ 612 void setOperators(InputRange!Operator op); 613 614 /** Called to process the header line of each file. Returns true if it was the 615 * first header line processed (used when reading multiple files). 616 */ 617 bool processHeaderLine(const char[][] lineFields); 618 619 /** Called to process non-header lines. */ 620 void processNextLine(const char[][] lineFields); 621 622 /** Called to write the header line. */ 623 void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 624 625 /** Called to write the result lines. */ 626 void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 627 } 628 629 /** SummarizerBase performs work shared by all sumarizers, most everything except for 630 * handling of unique keys. 631 * 632 * The base class handles creation, allocates storage for Operators and SharedFieldValues, 633 * and similar. Derived classes deal primarily with unique keys and the associated Calculators 634 * and UniqueKeyValuesLists. 635 */ 636 class SummarizerBase(OutputRange) : Summarizer!OutputRange 637 { 638 private char _inputFieldDelimiter; 639 private bool _hasProcessedFirstHeaderLine = false; 640 private SharedFieldValues _sharedFieldValues = null; // Null if no shared field value lists. 641 protected MissingFieldPolicy _missingPolicy; 642 protected DList!Operator _operators; 643 protected size_t _numOperators = 0; 644 645 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 646 { 647 _inputFieldDelimiter = inputFieldDelimiter; 648 _missingPolicy = missingPolicy; 649 } 650 651 char inputFieldDelimiter() const @property 652 { 653 return _inputFieldDelimiter; 654 } 655 656 /** Sets the Operators used by the Summarizer. Called after construction. */ 657 void setOperators(InputRange!Operator operators) 658 { 659 foreach (op; operators) 660 { 661 _operators.insertBack(op); 662 _numOperators++; 663 auto numericFieldsToSave = op.numericFieldsToSave(); 664 auto textFieldsToSave = op.textFieldsToSave(); 665 666 if (numericFieldsToSave.length > 0 || textFieldsToSave.length > 0) 667 { 668 if (_sharedFieldValues is null) 669 { 670 _sharedFieldValues = new SharedFieldValues(); 671 } 672 numericFieldsToSave.each!(x => _sharedFieldValues.addNumericIndex(x)); 673 textFieldsToSave.each!(x => _sharedFieldValues.addTextIndex(x)); 674 } 675 } 676 } 677 678 /** Called to process the header line of each file. Returns true if it was the 679 * first header line processed (used when reading multiple files). 680 */ 681 bool processHeaderLine(const char[][] lineFields) 682 { 683 if (!_hasProcessedFirstHeaderLine) 684 { 685 _operators.each!(x => x.processHeaderLine(lineFields)); 686 _hasProcessedFirstHeaderLine = true; 687 return true; 688 } 689 else 690 { 691 return false; 692 } 693 } 694 695 protected final UniqueKeyValuesLists makeUniqueKeyValuesLists() 696 { 697 return (_sharedFieldValues is null) 698 ? null 699 : _sharedFieldValues.makeUniqueKeyValuesLists; 700 } 701 702 abstract void processNextLine(const char[][] lineFields); 703 abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions); 704 abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions); 705 } 706 707 /** The NoKeySummarizer is used when summarizing values across the entire input. 708 * 709 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing 710 * through that mechanism. 711 */ 712 final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange 713 { 714 private Calculator[] _calculators; 715 private UniqueKeyValuesLists _valueLists; 716 717 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 718 { 719 super(inputFieldDelimiter, missingPolicy); 720 } 721 722 /** Called after initializing the object for each operator to be processed. */ 723 override void setOperators(InputRange!Operator operators) 724 { 725 super.setOperators(operators); 726 727 /* Only one Calculator per Operation, so create them as Operators are added. */ 728 foreach (op; operators) _calculators ~= op.makeCalculator; 729 _valueLists = super.makeUniqueKeyValuesLists(); 730 } 731 732 /** Called to process non-header lines. */ 733 override void processNextLine(const char[][] lineFields) 734 { 735 _calculators.each!(x => x.processNextLine(lineFields)); 736 if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy); 737 } 738 739 /** Called to write the header line. */ 740 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 741 { 742 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 743 put(outputStream, '\n'); 744 } 745 746 /** Called to write the result lines. */ 747 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 748 { 749 put(outputStream, 750 _calculators[] 751 .map!(x => x.calculate(_valueLists, printOptions)) 752 .join(printOptions.fieldDelimiter)); 753 put(outputStream, '\n'); 754 } 755 } 756 757 /** KeySummarizerBase does work shared by the single key and multi-key summarizers. 758 * 759 * The primary difference between those two is the formation of the key. The primary 760 * reason for separating those into two separate classes is to simplify (speed-up) 761 * handling of single field keys, which are the most common use case. 762 */ 763 class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange 764 { 765 protected struct UniqueKeyData 766 { 767 Calculator[] calculators; 768 UniqueKeyValuesLists valuesLists; 769 } 770 771 private DList!string _uniqueKeys; 772 private UniqueKeyData[string] _uniqueKeyData; 773 774 this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 775 { 776 super(inputFieldDelimiter, missingPolicy); 777 } 778 779 protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields) 780 { 781 debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string); 782 783 auto dataPtr = (key in _uniqueKeyData); 784 auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr; 785 786 data.calculators.each!(x => x.processNextLine(lineFields)); 787 if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy); 788 } 789 790 protected UniqueKeyData addUniqueKey(string key) 791 { 792 assert(key !in _uniqueKeyData); 793 794 _uniqueKeys.insertBack(key); 795 796 auto calculators = new Calculator[_numOperators]; 797 size_t i = 0; 798 foreach (op; _operators) 799 { 800 calculators[i] = op.makeCalculator; 801 i++; 802 } 803 804 return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists()); 805 } 806 807 override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 808 { 809 put(outputStream, keyFieldHeader()); 810 put(outputStream, printOptions.fieldDelimiter); 811 put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter)); 812 put(outputStream, '\n'); 813 } 814 815 override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions) 816 { 817 foreach(key; _uniqueKeys) 818 { 819 auto data = _uniqueKeyData[key]; 820 put(outputStream, key); 821 put(outputStream, printOptions.fieldDelimiter); 822 put(outputStream, 823 data.calculators[] 824 .map!(x => x.calculate(data.valuesLists, printOptions)) 825 .join(printOptions.fieldDelimiter)); 826 put(outputStream, '\n'); 827 } 828 } 829 830 abstract string keyFieldHeader() const @property; 831 } 832 833 /** This Summarizer is for the case where the unique key is based on exactly one field. 834 */ 835 final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 836 { 837 private size_t _keyFieldIndex = 0; 838 private string _keyFieldHeader; 839 private DList!string _uniqueKeys; 840 841 this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 842 { 843 super(inputFieldDelimiter, missingPolicy); 844 _keyFieldIndex = keyFieldIndex; 845 _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex); 846 } 847 848 override string keyFieldHeader() const @property 849 { 850 return _keyFieldHeader; 851 } 852 853 override bool processHeaderLine(const char[][] lineFields) 854 { 855 assert(_keyFieldIndex <= lineFields.length); 856 857 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 858 if (isFirstHeaderLine) 859 { 860 _keyFieldHeader = lineFields[_keyFieldIndex].to!string; 861 } 862 return isFirstHeaderLine; 863 } 864 865 override void processNextLine(const char[][] lineFields) 866 { 867 assert(_keyFieldIndex < lineFields.length); 868 processNextLineWithKey(lineFields[_keyFieldIndex], lineFields); 869 } 870 } 871 872 /** This Summarizer is for the case where the unique key is based on multiple fields. 873 */ 874 final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange 875 { 876 private size_t[] _keyFieldIndices; 877 private string _keyFieldHeader; 878 private DList!string _uniqueKeys; 879 880 this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy) 881 { 882 super(inputFieldDelimiter, missingPolicy); 883 _keyFieldIndices = keyFieldIndices.dup; 884 _keyFieldHeader = 885 _keyFieldIndices.map!(i => fieldHeaderFromIndex(i)) 886 .join(inputFieldDelimiter); 887 } 888 889 override string keyFieldHeader() const @property 890 { 891 return _keyFieldHeader; 892 } 893 894 override bool processHeaderLine(const char[][] lineFields) 895 { 896 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 897 assert(_keyFieldIndices.length >= 2); 898 899 bool isFirstHeaderLine = super.processHeaderLine(lineFields); 900 if (isFirstHeaderLine) 901 { 902 _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 903 } 904 return isFirstHeaderLine; 905 } 906 907 override void processNextLine(const char[][] lineFields) 908 { 909 assert(_keyFieldIndices.all!(x => x < lineFields.length)); 910 assert(_keyFieldIndices.length >= 2); 911 912 string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string; 913 processNextLineWithKey(key, lineFields); 914 } 915 } 916 917 version(unittest) 918 { 919 /* testSummarizer is a helper that can run many types of unit tests against 920 * Summarizers. It can also test operators, but there are separate helper functions 921 * better suited for that purpose. 922 * 923 * Arguments are a command line args, an input file, and expected output. The 924 * input file and expected output are already split into lines and fields, the helper 925 * manages re-assembly. The program name from the command line args is printed if an 926 * an error occurs, it is useful to identify the test that failed. 927 * 928 * Note: Much of this is a duplication tsvSummarize logic. Better abstraction of 929 * file input/output would enable running unit tests directly on top of tsvSummarize. 930 * 931 * Update (April 2020): With the introduction of InputSourceRange and ByLineSource, 932 * there needs to be a physical file when call processArgs. Its hard to get around, 933 * as the intent is to read the header line of the first input file during command 934 * line argument processing. Eventually this unit test process will need to be 935 * rewritten. For now, a file with the equivalent data is being added to the command 936 * line. 937 */ 938 void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected) 939 { 940 import std.array : appender; 941 942 assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty."); 943 944 auto formatAssertMessage(T...)(string msg, T formatArgs) 945 { 946 auto formatString = "[testSummarizer] %s: " ~ msg; 947 return format(formatString, cmdArgs[0], formatArgs); 948 } 949 950 TsvSummarizeOptions cmdopt; 951 auto savedCmdArgs = cmdArgs.to!string; 952 auto r = cmdopt.processArgs(cmdArgs); 953 assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs)); 954 955 assert(file.all!(line => line.length >= cmdopt.endFieldIndex), 956 formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file.")); 957 958 /* Pick the Summarizer based on the number of key-fields entered. */ 959 auto summarizer = 960 (cmdopt.keyFields.length == 0) 961 ? new NoKeySummarizer!(typeof(appender!(char[])()))( 962 cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 963 964 : (cmdopt.keyFields.length == 1) 965 ? new OneKeySummarizer!(typeof(appender!(char[])()))( 966 cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy) 967 968 : new MultiKeySummarizer!(typeof(appender!(char[])()))( 969 cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy); 970 971 /* Add the operators to the Summarizer. */ 972 summarizer.setOperators(inputRangeObject(cmdopt.operators[])); 973 974 /* Process the file one line at a time. */ 975 auto lineFields = new char[][](cmdopt.endFieldIndex); 976 bool headerFound = false; 977 foreach (lineNum, line; file.enumerate(1)) 978 { 979 /* Copy the needed fields to the fields array. */ 980 foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup; 981 982 if (cmdopt.hasHeader && lineNum == 1) 983 { 984 if (!headerFound) 985 { 986 summarizer.processHeaderLine(lineFields); 987 headerFound = true; 988 } 989 } 990 else 991 { 992 try summarizer.processNextLine(lineFields); 993 catch (Exception exc) 994 { 995 assert(false, formatAssertMessage(exc.msg)); 996 } 997 } 998 } 999 auto printOptions = SummarizerPrintOptions( 1000 cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision); 1001 1002 auto summarizerOutput = appender!(char[])(); 1003 1004 if (cmdopt.hasHeader || cmdopt.writeHeader) 1005 { 1006 summarizer.writeSummaryHeader(summarizerOutput, printOptions); 1007 } 1008 1009 summarizer.writeSummaryBody(summarizerOutput, printOptions); 1010 auto expectedOutput = expected.map!(x => x.joiner(cmdopt.inputFieldDelimiter.to!string)).joiner("\n").to!string; 1011 if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n"; 1012 1013 assert(summarizerOutput.data == expectedOutput, 1014 formatAssertMessage( 1015 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 1016 expectedOutput.to!string, summarizerOutput.data.to!string)); 1017 } 1018 1019 void writeDataFile(string filepath, string[][] fileData) 1020 { 1021 import std.algorithm; 1022 import std.stdio; 1023 1024 auto f = filepath.File("w"); 1025 foreach (record; fileData) f.writeln(record.joiner("\t")); 1026 f.close; 1027 } 1028 } 1029 1030 unittest 1031 { 1032 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 1033 import std.file : mkdir, rmdirRecurse; 1034 import std.path : buildPath; 1035 1036 auto testDir = makeUnittestTempDir("tsv_summarizer"); 1037 scope(exit) testDir.rmdirRecurse; 1038 1039 /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited 1040 * extent, command line option handling (TsvSummarizeOptions). Individual operators 1041 * have separate tests, those tests test the no-key summarizer. The Values operator is 1042 * used in these tests. It engages a number of behaviors, and the results have limited 1043 * ambiguity. Using only one operator limits dependence on individual operators. 1044 * 1045 * Update (April 2020): There now needs to be a real file passed to testSummarizer. 1046 * See the comments with testSummarizer for details. 1047 */ 1048 1049 auto file1 = [["fld1", "fld2", "fld3"], 1050 ["a", "a", "3"], 1051 ["c", "a", "2b"], 1052 ["c", "bc", ""], 1053 ["a", "c", "2b"], 1054 ["", "bc", ""], 1055 ["c", "bc", "3"]]; 1056 1057 auto file1Path = buildPath(testDir, "file1.tsv"); 1058 auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv"); 1059 writeDataFile(file1Path, file1); 1060 writeDataFile(file1NoHeaderPath, file1[1 .. $]); 1061 1062 /* Single-key summarizer tests. 1063 */ 1064 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path], 1065 file1, 1066 [["fld1", "fld1_values"], 1067 ["a", "a|a"], 1068 ["c", "c|c|c"], 1069 ["", ""]] 1070 ); 1071 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path], 1072 file1, 1073 [["fld1", "fld2_values"], 1074 ["a", "a|c"], 1075 ["c", "a|bc|bc"], 1076 ["", "bc"]] 1077 ); 1078 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path], 1079 file1, 1080 [["fld1", "fld3_values"], 1081 ["a", "3|2b"], 1082 ["c", "2b||3"], 1083 ["", ""]] 1084 ); 1085 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path], 1086 file1, 1087 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1088 ["a", "a|a", "a|c", "3|2b"], 1089 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1090 ["", "", "bc", ""]] 1091 ); 1092 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path], 1093 file1, 1094 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1095 ["a", "a|a", "a|c", "3|2b"], 1096 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1097 ["", "", "bc", ""]] 1098 ); 1099 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path], 1100 file1, 1101 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1102 ["a", "3|2b", "a|c", "a|a"], 1103 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1104 ["", "", "bc", ""]] 1105 ); 1106 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path], 1107 file1, 1108 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1109 ["a", "3|2b", "a|c", "a|a"], 1110 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1111 ["", "", "bc", ""]] 1112 ); 1113 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path], 1114 file1, 1115 [["fld2", "fld1_values"], 1116 ["a", "a|c"], 1117 ["bc", "c||c"], 1118 ["c", "a"]] 1119 ); 1120 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path], 1121 file1, 1122 [["fld2", "fld2_values"], 1123 ["a", "a|a"], 1124 ["bc", "bc|bc|bc"], 1125 ["c", "c"]] 1126 ); 1127 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path], 1128 file1, 1129 [["fld2", "fld3_values"], 1130 ["a", "3|2b"], 1131 ["bc", "||3"], 1132 ["c", "2b"]] 1133 ); 1134 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path], 1135 file1, 1136 [["fld2", "fld1_values", "fld3_values"], 1137 ["a", "a|c", "3|2b"], 1138 ["bc", "c||c", "||3"], 1139 ["c", "a", "2b"]] 1140 ); 1141 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path], 1142 file1, 1143 [["fld2", "fld3_values", "fld1_values"], 1144 ["a", "3|2b", "a|c"], 1145 ["bc", "||3", "c||c"], 1146 ["c", "2b", "a"]] 1147 ); 1148 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path], 1149 file1, 1150 [["fld3", "fld1_values"], 1151 ["3", "a|c"], 1152 ["2b", "c|a"], 1153 ["", "c|"]] 1154 ); 1155 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path], 1156 file1, 1157 [["fld3", "fld2_values"], 1158 ["3", "a|bc"], 1159 ["2b", "a|c"], 1160 ["", "bc|bc"]] 1161 ); 1162 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path], 1163 file1, 1164 [["fld3", "fld1_values", "fld2_values"], 1165 ["3", "a|c", "a|bc"], 1166 ["2b", "c|a", "a|c"], 1167 ["", "c|", "bc|bc"]] 1168 ); 1169 1170 /* Multi-key summarizer tests. 1171 */ 1172 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path], 1173 file1, 1174 [["fld1", "fld2", "fld1_values"], 1175 ["a", "a", "a"], 1176 ["c", "a", "c"], 1177 ["c", "bc", "c|c"], 1178 ["a", "c", "a"], 1179 ["", "bc", ""]] 1180 ); 1181 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path], 1182 file1, 1183 [["fld1", "fld2", "fld2_values"], 1184 ["a", "a", "a"], 1185 ["c", "a", "a"], 1186 ["c", "bc", "bc|bc"], 1187 ["a", "c", "c"], 1188 ["", "bc", "bc"]] 1189 ); 1190 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path], 1191 file1, 1192 [["fld1", "fld2", "fld3_values"], 1193 ["a", "a", "3"], 1194 ["c", "a", "2b"], 1195 ["c", "bc", "|3"], 1196 ["a", "c", "2b"], 1197 ["", "bc", ""]] 1198 ); 1199 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path], 1200 file1, 1201 [["fld1", "fld2", "fld3_values", "fld1_values"], 1202 ["a", "a", "3", "a"], 1203 ["c", "a", "2b", "c"], 1204 ["c", "bc", "|3", "c|c"], 1205 ["a", "c", "2b", "a"], 1206 ["", "bc", "", ""]] 1207 ); 1208 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path], 1209 file1, 1210 [["fld3", "fld2", "fld1_values"], 1211 ["3", "a", "a"], 1212 ["2b", "a", "c"], 1213 ["", "bc", "c|"], 1214 ["2b", "c", "a"], 1215 ["3", "bc", "c"]] 1216 ); 1217 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path], 1218 file1, 1219 [["fld3", "fld2", "fld1_values"], 1220 ["3", "a", "a"], 1221 ["2b", "a", "c"], 1222 ["", "bc", "c|"], 1223 ["2b", "c", "a"], 1224 ["3", "bc", "c"]] 1225 ); 1226 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path], 1227 file1, 1228 [["fld2", "fld1", "fld3", "fld2_values"], 1229 ["a", "a", "3", "a"], 1230 ["a", "c", "2b", "a"], 1231 ["bc", "c", "", "bc"], 1232 ["c", "a", "2b", "c"], 1233 ["bc", "", "", "bc"], 1234 ["bc", "c", "3", "bc"]] 1235 ); 1236 1237 /* Missing policies. */ 1238 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path], 1239 file1, 1240 [["fld1", "fld1_values"], 1241 ["a", "a|a"], 1242 ["c", "c|c|c"], 1243 ["", ""]] 1244 ); 1245 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path], 1246 file1, 1247 [["fld1", "fld2_values"], 1248 ["a", "a|c"], 1249 ["c", "a|bc|bc"], 1250 ["", "bc"]] 1251 ); 1252 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path], 1253 file1, 1254 [["fld1", "fld3_values"], 1255 ["a", "3|2b"], 1256 ["c", "2b|3"], 1257 ["", ""]] 1258 ); 1259 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path], 1260 file1, 1261 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1262 ["a", "a|a", "a|c", "3|2b"], 1263 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1264 ["", "", "bc", ""]] 1265 ); 1266 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path], 1267 file1, 1268 [["fld1", "fld1_values"], 1269 ["a", "a|a"], 1270 ["c", "c|c|c"], 1271 ["", "NA"]] 1272 ); 1273 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path], 1274 file1, 1275 [["fld1", "fld2_values"], 1276 ["a", "a|c"], 1277 ["c", "a|bc|bc"], 1278 ["", "bc"]] 1279 ); 1280 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path], 1281 file1, 1282 [["fld1", "fld3_values"], 1283 ["a", "3|2b"], 1284 ["c", "2b|NA|3"], 1285 ["", "NA"]] 1286 ); 1287 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path], 1288 file1, 1289 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1290 ["a", "a|a", "a|c", "3|2b"], 1291 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1292 ["", "NA", "bc", "NA"]] 1293 ); 1294 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path], 1295 file1, 1296 [["fld1", "fld2", "fld3_values", "fld1_values"], 1297 ["a", "a", "3", "a"], 1298 ["c", "a", "2b", "c"], 1299 ["c", "bc", "3", "c|c"], 1300 ["a", "c", "2b", "a"], 1301 ["", "bc", "", ""]] 1302 ); 1303 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path], 1304 file1, 1305 [["fld3", "fld2", "fld1_values"], 1306 ["3", "a", "a"], 1307 ["2b", "a", "c"], 1308 ["", "bc", "c"], 1309 ["2b", "c", "a"], 1310 ["3", "bc", "c"]] 1311 ); 1312 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path], 1313 file1, 1314 [["fld2", "fld1", "fld3", "fld2_values"], 1315 ["a", "a", "3", "a"], 1316 ["a", "c", "2b", "a"], 1317 ["bc", "c", "", "bc"], 1318 ["c", "a", "2b", "c"], 1319 ["bc", "", "", "bc"], 1320 ["bc", "c", "3", "bc"]] 1321 ); 1322 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path], 1323 file1, 1324 [["fld1", "fld2", "fld3_values", "fld1_values"], 1325 ["a", "a", "3", "a"], 1326 ["c", "a", "2b", "c"], 1327 ["c", "bc", "NA|3", "c|c"], 1328 ["a", "c", "2b", "a"], 1329 ["", "bc", "NA", "NA"]] 1330 ); 1331 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path], 1332 file1, 1333 [["fld3", "fld2", "fld1_values"], 1334 ["3", "a", "a"], 1335 ["2b", "a", "c"], 1336 ["", "bc", "c|NA"], 1337 ["2b", "c", "a"], 1338 ["3", "bc", "c"]] 1339 ); 1340 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path], 1341 file1, 1342 [["fld2", "fld1", "fld3", "fld2_values"], 1343 ["a", "a", "3", "a"], 1344 ["a", "c", "2b", "a"], 1345 ["bc", "c", "", "bc"], 1346 ["c", "a", "2b", "c"], 1347 ["bc", "", "", "bc"], 1348 ["bc", "c", "3", "bc"]] 1349 ); 1350 1351 /* Validate that the no-key summarizer works with testSummarizer helper function. 1352 */ 1353 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path], 1354 file1, 1355 [["fld1_values", "fld2_values"], 1356 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1357 ); 1358 1359 /* Header variations: no header line; auto-generated header line; custom headers. 1360 */ 1361 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath], 1362 file1[1..$], 1363 [["a", "a|a"], 1364 ["c", "c|c|c"], 1365 ["", ""]] 1366 ); 1367 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath], 1368 file1[1..$], 1369 [["a", "a", "a"], 1370 ["c", "a", "a"], 1371 ["c", "bc", "bc|bc"], 1372 ["a", "c", "c"], 1373 ["", "bc", "bc"]] 1374 ); 1375 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath], 1376 file1[1..$], 1377 [["field2", "field1_values"], 1378 ["a", "a|c"], 1379 ["bc", "c||c"], 1380 ["c", "a"]] 1381 ); 1382 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath], 1383 file1[1..$], 1384 [["field3", "field2", "field1_values"], 1385 ["3", "a", "a"], 1386 ["2b", "a", "c"], 1387 ["", "bc", "c|"], 1388 ["2b", "c", "a"], 1389 ["3", "bc", "c"]] 1390 ); 1391 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path], 1392 file1, 1393 [["fld2", "Field3Values"], 1394 ["a", "3|2b"], 1395 ["bc", "||3"], 1396 ["c", "2b"]] 1397 ); 1398 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path], 1399 file1, 1400 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1401 ["a", "a", "3", "a"], 1402 ["c", "a", "2b", "c"], 1403 ["c", "bc", "|3", "c|c"], 1404 ["a", "c", "2b", "a"], 1405 ["", "bc", "", ""]] 1406 ); 1407 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath], 1408 file1[1..$], 1409 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1410 ["a", "3|2b", "a|c", "a|a"], 1411 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1412 ["", "", "bc", ""]] 1413 ); 1414 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1415 file1[1..$], 1416 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1417 ["a", "3", "a", "3", "a", "a"], 1418 ["c", "2b", "a", "2b", "c", "a"], 1419 ["c", "", "bc", "", "c", "bc"], 1420 ["a", "2b", "c", "2b", "a", "c"], 1421 ["", "", "bc", "", "", "bc"], 1422 ["c", "3", "bc", "3", "c", "bc"]] 1423 ); 1424 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1425 file1[1..$], 1426 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1427 ["a", "3", "a", "3", "a", "a"], 1428 ["c", "2b", "a", "2b", "c", "a"], 1429 ["c", "", "bc", "", "c", "bc"], 1430 ["a", "2b", "c", "2b", "a", "c"], 1431 ["", "", "bc", "", "", "bc"], 1432 ["c", "3", "bc", "3", "c", "bc"]] 1433 ); 1434 1435 /* Alternate file widths and lengths. 1436 */ 1437 1438 auto file3x2 = [["fld1", "fld2", "fld3"], 1439 ["a", "b", "c"], 1440 ["c", "b", "a"]]; 1441 1442 auto file3x2Path = buildPath(testDir, "file3x2.tsv"); 1443 auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv"); 1444 writeDataFile(file3x2Path, file3x2); 1445 writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]); 1446 1447 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path], 1448 file3x2, 1449 [["fld1", "fld3_values"], 1450 ["a", "c"], 1451 ["c", "a"]] 1452 ); 1453 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path], 1454 file3x2, 1455 [["fld2", "fld3_values"], 1456 ["b", "c|a"]] 1457 ); 1458 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path], 1459 file3x2, 1460 [["fld2", "fld1", "fld3_values"], 1461 ["b", "a", "c"], 1462 ["b", "c", "a"]] 1463 ); 1464 1465 auto file3x1 = [["fld1", "fld2", "fld3"], 1466 ["a", "b", "c"]]; 1467 1468 auto file3x1Path = buildPath(testDir, "file3x1.tsv"); 1469 auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv"); 1470 writeDataFile(file3x1Path, file3x1); 1471 writeDataFile(file3x1NoHeaderPath, file3x1[1 .. $]); 1472 1473 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path], 1474 file3x1, 1475 [["fld1", "fld3_values"], 1476 ["a", "c"]] 1477 ); 1478 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath], 1479 file3x1[1..$], 1480 [["a", "c"]] 1481 ); 1482 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path], 1483 file3x1, 1484 [["fld2", "fld1", "fld3_values"], 1485 ["b", "a", "c"]] 1486 ); 1487 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath], 1488 file3x1[1..$], 1489 [["b", "a", "c"]] 1490 ); 1491 1492 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1493 1494 auto file3x0Path = buildPath(testDir, "file3x0.tsv"); 1495 auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv"); 1496 writeDataFile(file3x0Path, file3x0); 1497 writeDataFile(file3x0NoHeaderPath, file3x0[1 .. $]); 1498 1499 1500 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path], 1501 file3x0, 1502 [["fld1", "fld3_values"]] 1503 ); 1504 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1505 file3x0[1..$], 1506 [] 1507 ); 1508 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1509 file3x0[1..$], 1510 [["field1", "field3_values"]] 1511 ); 1512 1513 1514 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path], 1515 file3x0, 1516 [["fld2", "fld1", "fld3_values"]] 1517 ); 1518 1519 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1520 file3x0[1..$], 1521 [] 1522 ); 1523 1524 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1525 file3x0[1..$], 1526 [["field2", "field1", "field3_values"]] 1527 ); 1528 1529 auto file2x1 = [["fld1", "fld2"], 1530 ["a", "b"]]; 1531 1532 auto file2x1Path = buildPath(testDir, "file2x1.tsv"); 1533 auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv"); 1534 writeDataFile(file2x1Path, file2x1); 1535 writeDataFile(file2x1NoHeaderPath, file2x1[1 .. $]); 1536 1537 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path], 1538 file2x1, 1539 [["fld1", "fld2_values"], 1540 ["a", "b"]] 1541 ); 1542 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path], 1543 file2x1, 1544 [["fld2", "fld1", "fld1_values"], 1545 ["b", "a", "a"]] 1546 ); 1547 1548 auto file2x0 = [["fld1", "fld2"]]; 1549 1550 auto file2x0Path = buildPath(testDir, "file2x0.tsv"); 1551 auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv"); 1552 writeDataFile(file2x0Path, file2x0); 1553 writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]); 1554 1555 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path], 1556 file2x0, 1557 [["fld1", "fld2_values"]] 1558 ); 1559 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path], 1560 file2x0, 1561 [["fld2", "fld1", "fld1_values"]] 1562 ); 1563 1564 auto file1x2 = [["fld1"], 1565 ["a"], 1566 [""]]; 1567 1568 auto file1x2Path = buildPath(testDir, "file1x2.tsv"); 1569 auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv"); 1570 writeDataFile(file1x2Path, file1x2); 1571 writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]); 1572 1573 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path], 1574 file1x2, 1575 [["fld1", "fld1_values"], 1576 ["a", "a"], 1577 ["", ""]] 1578 ); 1579 1580 auto file1x2b = [["fld1"], 1581 [""], 1582 [""]]; 1583 1584 auto file1x2bPath = buildPath(testDir, "file1x2b.tsv"); 1585 auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv"); 1586 writeDataFile(file1x2bPath, file1x2b); 1587 writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. $]); 1588 1589 testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath], 1590 file1x2b, 1591 [["fld1", "fld1_values"], 1592 ["", "|"]] 1593 ); 1594 1595 auto file1x1 = [["fld1"], 1596 ["x"]]; 1597 1598 auto file1x1Path = buildPath(testDir, "file1x1.tsv"); 1599 auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv"); 1600 writeDataFile(file1x1Path, file1x1); 1601 writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]); 1602 1603 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path], 1604 file1x1, 1605 [["fld1", "fld1_values"], 1606 ["x", "x"]] 1607 ); 1608 1609 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1610 file1x1[1..$], 1611 [["x", "x"]] 1612 ); 1613 1614 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1615 file1x1[1..$], 1616 [["field1", "field1_values"], 1617 ["x", "x"]] 1618 ); 1619 1620 auto file1x1b = [["fld1"], 1621 [""]]; 1622 1623 auto file1x1bPath = buildPath(testDir, "file1x1b.tsv"); 1624 auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv"); 1625 writeDataFile(file1x1bPath, file1x1b); 1626 writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]); 1627 1628 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath], 1629 file1x1b, 1630 [["fld1", "fld1_values"], 1631 ["", ""]] 1632 ); 1633 1634 auto file1x0 = [["fld1"]]; 1635 1636 auto file1x0Path = buildPath(testDir, "file1x0.tsv"); 1637 auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv"); 1638 writeDataFile(file1x0Path, file1x0); 1639 writeDataFile(file1x0NoHeaderPath, file1x0[1 .. $]); 1640 1641 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path], 1642 file1x0, 1643 [["fld1", "fld1_values"]] 1644 ); 1645 1646 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1647 file1x0[1..$], 1648 [] 1649 ); 1650 1651 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1652 file1x0[1..$], 1653 [["field1", "field1_values"]] 1654 ); 1655 1656 /* Alternate delimiters. */ 1657 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1Path], 1658 file1, 1659 [["fld1_values", "fld2_values"], 1660 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1661 ); 1662 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path], 1663 file1, 1664 [["fld1_values", "fld2_values"], 1665 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1666 ); 1667 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1Path], 1668 file1, 1669 [["fld1_values", "fld2_values"], 1670 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1671 ); 1672 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1673 "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath], 1674 file1[1..$], 1675 [["field2", "field1_values"], 1676 ["a", "a:c"], 1677 ["bc", "c::c"], 1678 ["c", "a"]] 1679 ); 1680 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1681 "--values-delimiter", "\\", file1NoHeaderPath], 1682 file1[1..$], 1683 [["a", "a", "a"], 1684 ["c", "a", "a"], 1685 ["c", "bc", "bc\\bc"], 1686 ["a", "c", "c"], 1687 ["", "bc", "bc"]] 1688 ); 1689 } 1690 1691 /* Summary Operators and Calculators 1692 * 1693 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1694 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1695 * Calculator is used to manage the summary calculation for each unique key in the input. 1696 * 1697 * As an example, consider the command: 1698 * 1699 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1700 * 1701 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1702 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1703 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1704 * calculator needs to track occurrence count and sum. Calculators produce the final 1705 * value when all processing is finished. 1706 * 1707 * Summary field headers 1708 * 1709 * There are several options for specifying summary field headers. The defaults combine the 1710 * operator name and the header of the field summarized. The defaults can be overridden on 1711 * on the command line. These scenarios are supported via the operator constructor and the 1712 * processHeaderLine() method. 1713 * 1714 * Missing field policy 1715 * 1716 * At present, tsv-summarize has a single policy for handling missing values that applies 1717 * to all operators. However, it is logically operator specific and is implemented that 1718 * way. The MissingFieldPolicy struct describes the policy, each operator contains one. 1719 * Calculators access thier operator's policy struct. 1720 */ 1721 1722 /** An Operator represents a summary calculation specified on the command line. 1723 * e.g. '--mean 5'. 1724 */ 1725 interface Operator 1726 { 1727 @property string header(); 1728 @property string name(); 1729 void processHeaderLine(const char[][] fields); 1730 size_t[] numericFieldsToSave(); // Numeric fields this Operator needs saved 1731 size_t[] textFieldsToSave(); // Text fields this Operator needs saved 1732 Calculator makeCalculator(); 1733 } 1734 1735 /** Calculators are responsible for the calculation of a single computation. They 1736 * process each line and produce the final value when all processing is finished. 1737 */ 1738 interface Calculator 1739 { 1740 void processNextLine(const char[][] fields); 1741 string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions); 1742 } 1743 1744 /** This class describes processing behavior when a missing value is encountered. 1745 */ 1746 final class MissingFieldPolicy 1747 { 1748 private bool _useMissing = true; // True if missing values are processed unchanged. 1749 private bool _replaceMissing = false; // True if missing values are replaced. 1750 private string _missingReplacement; // Replacement string if replaceMissing is true. 1751 1752 this (const bool excludeMissing = false, string missingReplacement = "") 1753 { 1754 updatePolicy(excludeMissing, missingReplacement); 1755 } 1756 1757 void updatePolicy(const bool excludeMissing, string missingReplacement) 1758 { 1759 _missingReplacement = missingReplacement; 1760 _replaceMissing = missingReplacement.length != 0; 1761 _useMissing = !excludeMissing && !replaceMissing; 1762 } 1763 1764 final bool isMissingField(const char[] field) const 1765 { 1766 return field.length == 0; 1767 } 1768 1769 final bool useMissing() const @property 1770 { 1771 return _useMissing; 1772 } 1773 1774 final bool excludeMissing() const @property 1775 { 1776 return !_useMissing && !_replaceMissing; 1777 } 1778 1779 final bool replaceMissing() const @property 1780 { 1781 return _replaceMissing; 1782 } 1783 1784 final string missingReplacement() const @property 1785 { 1786 return _missingReplacement; 1787 } 1788 } 1789 1790 /* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected 1791 * while reading data. Operations like median collect all values and operate on them when 1792 * running the final calculation. Value lists are needed for each unique key. A command 1793 * using multiple Operators may save multiple fields. And, different Operators may be run 1794 * against the same field. 1795 * 1796 * The last part motivates these classes. Handling large data sets necessitates minimizing 1797 * in-memory storage, making it desirable to share identical lists between Calculators. 1798 * Otherwise, each Calculator could implement its own storage, which would be simpler. 1799 * 1800 * The setup works as follows: 1801 * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods). 1802 * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps list 1803 * of the fields advertised by Operators as needing sharing. This list gets created 1804 * during command initialization (SummarizerBase.setOperators). 1805 * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every 1806 * time a new unique key is found, in parellel to the Calculator objects created for the 1807 * key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes. 1808 * - A unique key's UniqueKeyValuesLists object is passed each input line, same as 1809 * Calculators, saving the values. 1810 * - Calculators retrieve the saved values during the calculation phase. The calculator's 1811 * ProcessNextField method is typically a no-op. 1812 * - Calculators cannot make assumptions about the order of the saved values. This is 1813 * pragmatic concession to median and quantile calculations, which need to sort the data, 1814 * at least partially. Rather than generate sorted copies, the current algorithms 1815 * sort the data in place. 1816 * 1817 * One concession to duplicate storage is that text and numeric versions of the same 1818 * field might be stored. The reason is because it's important to convert text to numbers 1819 * as they are read so that useful error messages can be generated. And, storing both 1820 * forms of the same field should be less common. 1821 * 1822 * The current implementation uses the same missing values policy for all fields. If 1823 * multiple policies become supported this will need to change. 1824 * 1825 * Built-in calculations - UniqueKeyValueLists have a built-in median operation. This is 1826 * to avoid repeated calculations of the median by different calculations. 1827 */ 1828 1829 final class SharedFieldValues 1830 { 1831 // Arrays with field indices that need to be saved. 1832 private size_t[] _numericFieldIndices; 1833 private size_t[] _textFieldIndices; 1834 1835 /* Called during summarizer setup to add a shared field value for a specific field index. 1836 * eg. '--median 7' will add end up calling addNumericIdex(6), 6 being the zero-based index. 1837 * A specific index is only added once. 1838 */ 1839 final void addNumericIndex (size_t index) 1840 { 1841 if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index; 1842 } 1843 1844 /* Similar to addNumericIndex, except adds a text index. */ 1845 final void addTextIndex (size_t index) 1846 { 1847 if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index; 1848 } 1849 1850 /* Called every time a new key is found, or once at the beginning of the program if no keys 1851 * are being used (entire column summarized). 1852 */ 1853 final UniqueKeyValuesLists makeUniqueKeyValuesLists() 1854 { 1855 return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices); 1856 } 1857 } 1858 1859 final class UniqueKeyValuesLists 1860 { 1861 /* A FieldValues object holds is a list of values collect for a specific field. A 1862 * unique key may hold several. For example, the command: 1863 * $ tsv-summarize --k 1 --median 4 -- median 5 1864 * requires keeping lists for both fields 4 and 5. This in turn will result in a 1865 * _numericFieldValues being a 2 element array, one with a list of field 4 values, 1866 * the second of field 5 values. Linear search is used to find a specific field. 1867 */ 1868 private FieldValues!double[] _numericFieldValues; 1869 private FieldValues!string[] _textFieldValues; 1870 private double[] _numericFieldMedians; 1871 1872 /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved. */ 1873 this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices) 1874 { 1875 if (numericFieldIndices.length > 0) 1876 { 1877 _numericFieldValues = new FieldValues!double[](numericFieldIndices.length); 1878 foreach (i, fieldIndex; numericFieldIndices) 1879 _numericFieldValues[i] = new FieldValues!double(fieldIndex); 1880 } 1881 1882 if (textFieldIndices.length > 0) 1883 { 1884 _textFieldValues = new FieldValues!string[](textFieldIndices.length); 1885 foreach (i, fieldIndex; textFieldIndices) 1886 _textFieldValues[i] = new FieldValues!string(fieldIndex); 1887 } 1888 } 1889 1890 void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) 1891 { 1892 _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1893 _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy)); 1894 } 1895 1896 private FieldValues!double findNumericFieldValues(size_t index) 1897 { 1898 alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b); 1899 auto r = find!pred(_numericFieldValues, index); 1900 assert(!r.empty); 1901 return r.front; 1902 } 1903 1904 private FieldValues!string findTextFieldValues(size_t index) 1905 { 1906 alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b); 1907 auto r = find!pred(_textFieldValues, index); 1908 assert(!r.empty); 1909 return r.front; 1910 } 1911 1912 final double[] numericValues(size_t index) 1913 { 1914 return findNumericFieldValues(index).getArray; 1915 } 1916 1917 final double[] numericValuesSorted(size_t index) 1918 { 1919 return findNumericFieldValues(index).getSortedArray; 1920 } 1921 1922 final string[] textValues(size_t index) 1923 { 1924 return findTextFieldValues(index).getArray; 1925 } 1926 1927 final string[] textValuesSorted(size_t index) 1928 { 1929 return findTextFieldValues(index).getSortedArray; 1930 } 1931 1932 final double numericValuesMedian(size_t index) 1933 { 1934 return findNumericFieldValues(index).median; 1935 } 1936 1937 private final class FieldValues(ValueType) 1938 { 1939 import std.array : appender; 1940 private size_t _fieldIndex; 1941 private Appender!(ValueType[]) _values; 1942 private bool _haveMedian = false; 1943 private bool _isSorted = false; 1944 private ValueType _medianValue; 1945 1946 this(size_t fieldIndex) 1947 { 1948 _fieldIndex = fieldIndex; 1949 } 1950 1951 final size_t length() const @property 1952 { 1953 return _values.data.length; 1954 } 1955 1956 final size_t fieldIndex() const @property 1957 { 1958 return _fieldIndex; 1959 } 1960 1961 final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy) 1962 { 1963 debug writefln("[%s]: %s", __FUNCTION__, fields.to!string); 1964 1965 const char[] field = fields[_fieldIndex]; 1966 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 1967 { 1968 _values.put(field.to!ValueType); 1969 _haveMedian = false; 1970 _isSorted = false; 1971 } 1972 else if (missingPolicy.replaceMissing) 1973 { 1974 _values.put(missingPolicy.missingReplacement.to!ValueType); 1975 _haveMedian = false; 1976 _isSorted = false; 1977 } 1978 } 1979 1980 /* Return an input range of the values. */ 1981 final auto values() 1982 { 1983 return _values.data; 1984 } 1985 1986 final ValueType[] getArray() 1987 { 1988 return _values.data; 1989 } 1990 1991 final ValueType[] getSortedArray() 1992 { 1993 if (!_isSorted) 1994 { 1995 import std.algorithm : sort; 1996 sort(_values.data); 1997 _isSorted = true; 1998 } 1999 return _values.data; 2000 } 2001 2002 final ValueType median() 2003 { 2004 if (!_haveMedian) 2005 { 2006 import tsv_utils.common.numerics : rangeMedian; 2007 _medianValue = _values.data.rangeMedian(); 2008 _haveMedian = true; 2009 } 2010 2011 return _medianValue; 2012 } 2013 } 2014 } 2015 2016 /** SingleFieldOperator is a base class for single field operators, the most common 2017 * Operator. Derived classes implement makeCalculator and the Calculator class it returns. 2018 */ 2019 class SingleFieldOperator : Operator 2020 { 2021 import std.typecons : Flag; 2022 2023 private string _name; 2024 private string _header; 2025 private size_t _fieldIndex; 2026 private bool _useHeaderSuffix; 2027 private bool _allowCustomHeader; 2028 private bool _hasCustomHeader = false; 2029 private size_t[] _numericFieldsToSave; 2030 private size_t[] _textFieldsToSave; 2031 private MissingFieldPolicy _missingPolicy; 2032 2033 this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, 2034 Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, 2035 Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) 2036 { 2037 _name = operatorName; 2038 _fieldIndex = fieldIndex; 2039 _missingPolicy = missingPolicy; 2040 _useHeaderSuffix = useHeaderSuffix; 2041 _allowCustomHeader = allowCustomHeader; 2042 // Default header. May be overrridden by custom header or header line. 2043 _header = 2044 fieldHeaderFromIndex(fieldIndex) 2045 .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); 2046 } 2047 2048 void setCustomHeader (string customHeader) 2049 { 2050 assert(_allowCustomHeader); 2051 _header = customHeader; 2052 _hasCustomHeader = true; 2053 } 2054 2055 final string name() const @property 2056 { 2057 return _name; 2058 } 2059 2060 final bool allowCustomHeader() const @property 2061 { 2062 return _allowCustomHeader; 2063 } 2064 2065 /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field 2066 * that the field values should be saved. These should called during construction. 2067 */ 2068 final void setSaveFieldValuesNumeric() 2069 { 2070 _numericFieldsToSave ~= _fieldIndex; 2071 } 2072 2073 final void setSaveFieldValuesText() 2074 { 2075 _textFieldsToSave ~= _fieldIndex; 2076 } 2077 2078 final MissingFieldPolicy missingPolicy() @property 2079 { 2080 return _missingPolicy; 2081 } 2082 2083 final size_t fieldIndex() const @property 2084 { 2085 return _fieldIndex; 2086 } 2087 2088 final string header() const @property 2089 { 2090 return _header; 2091 } 2092 2093 final bool useHeaderSuffix() const @property 2094 { 2095 return _useHeaderSuffix; 2096 } 2097 2098 void processHeaderLine(const char[][] fields) 2099 { 2100 if (!_hasCustomHeader) { 2101 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2102 _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 2103 _useHeaderSuffix ? _name : ""); 2104 } 2105 } 2106 2107 final size_t[] numericFieldsToSave() 2108 { 2109 return _numericFieldsToSave; 2110 } 2111 2112 final size_t[] textFieldsToSave() 2113 { 2114 return _textFieldsToSave; 2115 } 2116 2117 abstract SingleFieldCalculator makeCalculator(); 2118 } 2119 2120 /** SingleFieldCalculator is a base class for the common case of calculators using a single 2121 * field. Derived classes implement processNextField() rather than processNextLine(). 2122 */ 2123 class SingleFieldCalculator : Calculator 2124 { 2125 private size_t _fieldIndex; 2126 2127 this(size_t fieldIndex) 2128 { 2129 _fieldIndex = fieldIndex; 2130 } 2131 2132 final size_t fieldIndex() const @property 2133 { 2134 return _fieldIndex; 2135 } 2136 2137 final void processNextLine(const char[][] fields) 2138 { 2139 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2140 2141 auto missingPolicy = getOperator.missingPolicy; 2142 const char[] field = fields[_fieldIndex]; 2143 2144 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 2145 { 2146 processNextField(field); 2147 } 2148 else if (missingPolicy.replaceMissing) 2149 { 2150 processNextField(missingPolicy.missingReplacement); 2151 } 2152 } 2153 2154 abstract SingleFieldOperator getOperator(); 2155 2156 abstract void processNextField(const char[] field); 2157 } 2158 2159 /* Unittest helper functions. Only compiled when -unittest is in effect. */ 2160 version(unittest) 2161 { 2162 /** A helper for SingleFieldOperator unit tests. 2163 * 2164 * testSingleFieldOperator takes a set of split file values, a field index, a header 2165 * suffix, and a set of expected values. The expected values array contains the 2166 * initial value (zero entries) and the expected values after each line. (One more 2167 * expected value than input lines.) The zero entry case is what is generated for an 2168 * empty file. An example testing the 'min' operator against a file with 2 columns, 2169 * 3 rows, using field index 1: 2170 * 2171 * testSingleFieldOperator!MinOperator( 2172 * [["10", "100"], // The split file. 3 lines by 2 rows. 2173 * ["5", "50"], 2174 * ["20", "200"]], 2175 * 1, // Field index (zero-based, so "100", "50", "200") 2176 * "min", // The header suffix, normally the operator name. 2177 * ["nan", "100", "50", "50"]); // Min value after processing each line. 2178 * 2179 * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. 2180 * Then run the operator is tested against each column, a total of six calls. Headers 2181 * are automatically checked. Additional entries can be used to extend coverage. 2182 * 2183 * A non-default MissingFieldPolicy can be provide as an optional last argument. 2184 * Operator tests should include exclusion and replacement variations. See operator 2185 * unit tests for details. 2186 * 2187 * The testSingleFieldOperatorBase adds an additional capability - Custom operator 2188 * init arguments. Currently this is used only by the quantile operator. 2189 * 2190 * These tests do not check unique key behavior (group-by). Operators don't have info 2191 * about unique keys, and interact with them only indirectly, via Calculators. 2192 */ 2193 void testSingleFieldOperator(OperatorClass : SingleFieldOperator) 2194 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2195 const char[][] expectedValues, 2196 MissingFieldPolicy missingPolicy = new MissingFieldPolicy) 2197 { 2198 testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy); 2199 } 2200 2201 void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...) 2202 (const char[][][] splitFile, size_t fieldIndex, string headerSuffix, 2203 const char[][] expectedValues, 2204 MissingFieldPolicy missingPolicy, 2205 T extraOpInitArgs) 2206 { 2207 import std.format : format; 2208 import std.array : appender; 2209 import std.string : chomp; 2210 import std.traits : EnumMembers; 2211 2212 auto numFields = (splitFile[0]).length; 2213 2214 assert(fieldIndex < numFields, 2215 format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s", 2216 headerSuffix)); 2217 assert(splitFile.length + 1 == expectedValues.length, 2218 format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2219 headerSuffix)); 2220 2221 /* printOptions - Only the 'values-delimiter' (2nd arg) is used these tests. */ 2222 auto printOptions = SummarizerPrintOptions('#', '|'); 2223 2224 /* An input header line. */ 2225 string[] inputHeaderLine = new string[numFields]; 2226 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2227 2228 /* The different expected output field headers. */ 2229 auto outputFieldHeaderWithNoHeaderLine = 2230 fieldHeaderFromIndex(fieldIndex) 2231 .summaryHeaderFromFieldHeader(headerSuffix); 2232 auto outputFieldHeaderFromHeaderLine = 2233 inputHeaderLine[fieldIndex] 2234 .summaryHeaderFromFieldHeader(headerSuffix); 2235 auto customOutputFieldHeader = "custom"; 2236 2237 enum HeaderUsecase { 2238 HeaderLine_DefaultHeader, 2239 HeaderLine_CustomHeader, 2240 NoHeaderLine_DefaultHeader, 2241 NoHeaderLine_CustomHeader, 2242 NoHeaderLine_NoOutputHeader, 2243 } 2244 2245 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2246 { 2247 return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2248 op.name, hc, actual, expected); 2249 } 2250 2251 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex, 2252 const char[] actual, const char[] expected) 2253 { 2254 return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'", 2255 op.name, hc, rowIndex, fieldIndex, actual, expected); 2256 } 2257 2258 /* Run the logic for each header use case. */ 2259 foreach (hc; EnumMembers!HeaderUsecase) 2260 { 2261 bool hasInputHeader = ( 2262 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2263 hc == HeaderUsecase.HeaderLine_CustomHeader 2264 ); 2265 bool hasOutputHeader = ( 2266 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2267 hc == HeaderUsecase.HeaderLine_CustomHeader || 2268 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2269 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2270 ); 2271 bool hasCustomHeader = ( 2272 hc == HeaderUsecase.HeaderLine_CustomHeader || 2273 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2274 ); 2275 2276 if (hasCustomHeader) assert(hasOutputHeader); 2277 2278 auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs); 2279 2280 if (hasCustomHeader) 2281 { 2282 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2283 op.setCustomHeader(customOutputFieldHeader); 2284 } 2285 2286 Operator[] operatorArray; 2287 operatorArray ~= op; 2288 2289 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2290 summarizer.setOperators(inputRangeObject(operatorArray)); 2291 2292 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2293 2294 if (hasOutputHeader) 2295 { 2296 /* Write the header line. Note that this is a one-field header, */ 2297 auto headerLineOutput = appender!(char[])(); 2298 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2299 2300 /* Test that the header was generated correctly. 2301 * 2302 * Note: Because the output is generated by a Summarizer, it will have a 2303 * trailing newline. Use chomp to trim it. 2304 */ 2305 final switch (hc) 2306 { 2307 case HeaderUsecase.HeaderLine_DefaultHeader: 2308 assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine, 2309 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2310 outputFieldHeaderFromHeaderLine)); 2311 break; 2312 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2313 assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine, 2314 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2315 outputFieldHeaderWithNoHeaderLine)); 2316 break; 2317 case HeaderUsecase.HeaderLine_CustomHeader: 2318 case HeaderUsecase.NoHeaderLine_CustomHeader: 2319 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2320 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2321 customOutputFieldHeader)); 2322 break; 2323 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2324 break; 2325 } 2326 2327 } 2328 2329 /* For each line, process the line, generate the output, and test that the 2330 * value is correct. Start with the empty file case. 2331 */ 2332 foreach (i, const char[] expected; expectedValues) 2333 { 2334 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2335 auto summaryLineOutput = appender!(char[])(); 2336 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2337 assert(summaryLineOutput.data.chomp == expected, 2338 valueAssertMessage(operatorArray[0], hc, i, fieldIndex, 2339 summaryLineOutput.data.chomp, expectedValues[i])); 2340 } 2341 } 2342 } 2343 } 2344 2345 /** ZeroFieldOperator is a base class for operators that take no input. The main use 2346 * case is the CountOperator, which counts the occurrences of each unique key. Other 2347 * uses are possible, for example, weighted random number assignment. 2348 * 2349 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify 2350 * the information available to such a routine. In particular, the split fields passed 2351 * to processHeaderLine and processNextLine don't include all fields in the input, 2352 * something that might not be obvious when implementing an operator. (Only fields 2353 * required by operators acting on specific fields are included.) 2354 */ 2355 class ZeroFieldOperator : Operator 2356 { 2357 import std.typecons : Flag; 2358 2359 private string _name; 2360 private string _header; 2361 2362 this(string operatorName) 2363 { 2364 _name = operatorName; 2365 _header = operatorName; 2366 } 2367 2368 void setCustomHeader (string customHeader) 2369 { 2370 _header = customHeader; 2371 } 2372 2373 bool allowCustomHeader() const @property 2374 { 2375 return true; 2376 } 2377 2378 final string name() const @property 2379 { 2380 return _name; 2381 } 2382 2383 final string header() const @property 2384 { 2385 return _header; 2386 } 2387 2388 /* A no-op. ZeroFieldOperators have no access to the header line. */ 2389 final void processHeaderLine(const char[][] fields) { } 2390 2391 /* A no-op. ZeroFieldOperators have no access to fields. */ 2392 final size_t[] numericFieldsToSave() 2393 { 2394 size_t[] emptyArray; 2395 return emptyArray; 2396 } 2397 2398 /* A no-op. ZeroFieldOperators have no access to fields. */ 2399 final size_t[] textFieldsToSave() 2400 { 2401 size_t[] emptyArray; 2402 return emptyArray; 2403 } 2404 2405 abstract ZeroFieldCalculator makeCalculator(); 2406 } 2407 2408 /** ZeroFieldCalculator is a base class for operators that don't use fields as input. 2409 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class. 2410 * 2411 * Derived classes implement processNextEntry() rather than processNextLine(), and the 2412 * single argument form of calculate() given as an abstract function. 2413 */ 2414 class ZeroFieldCalculator : Calculator 2415 { 2416 this() { } 2417 2418 final void processNextLine(const char[][] fields) 2419 { 2420 debug writefln("[%s]", __FUNCTION__,); 2421 processNextEntry(); 2422 } 2423 2424 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2425 { 2426 return calculate(printOptions); 2427 } 2428 2429 abstract void processNextEntry(); 2430 abstract string calculate(const ref SummarizerPrintOptions printOptions); 2431 } 2432 2433 version(unittest) 2434 { 2435 /* A helper for ZeroFieldOperator unit tests. 2436 * 2437 * testZeroFieldOperator takes a set of split file values, a default header, and a 2438 * set of expected values. The expected values array contains the expected values 2439 * after each line. 2440 * 2441 * testZeroFieldOperator is very similar to testSingleFieldOperator, except that 2442 * there is no use of field indices and fewer types of headers. See the latter's 2443 * documentation and the CountOperator unit tests for examples. 2444 */ 2445 void testZeroFieldOperator(OperatorClass : ZeroFieldOperator) 2446 (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues) 2447 { 2448 import std.format : format; 2449 import std.array : appender; 2450 import std.string : chomp; 2451 import std.traits : EnumMembers; 2452 2453 auto numFields = (splitFile[0]).length; 2454 2455 assert(splitFile.length + 1 == expectedValues.length, 2456 format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s", 2457 defaultHeader)); 2458 2459 /* printOptions - Not used these tests, but needed for API calls. */ 2460 auto printOptions = SummarizerPrintOptions('#', '|'); 2461 2462 /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */ 2463 auto missingPolicy = new MissingFieldPolicy; 2464 2465 /* An input header line. */ 2466 string[] inputHeaderLine = new string[numFields]; 2467 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2468 2469 auto customOutputFieldHeader = "custom"; 2470 2471 enum HeaderUsecase { 2472 HeaderLine_DefaultHeader, 2473 HeaderLine_CustomHeader, 2474 NoHeaderLine_DefaultHeader, 2475 NoHeaderLine_CustomHeader, 2476 NoHeaderLine_NoOutputHeader, 2477 } 2478 2479 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2480 { 2481 return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2482 op.name, hc, actual, expected); 2483 } 2484 2485 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, 2486 const char[] actual, const char[] expected) 2487 { 2488 return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", 2489 op.name, hc, rowIndex, actual, expected); 2490 } 2491 2492 /* Run the logic for each header use case. */ 2493 foreach (hc; EnumMembers!HeaderUsecase) 2494 { 2495 bool hasInputHeader = ( 2496 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2497 hc == HeaderUsecase.HeaderLine_CustomHeader 2498 ); 2499 bool hasOutputHeader = ( 2500 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2501 hc == HeaderUsecase.HeaderLine_CustomHeader || 2502 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2503 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2504 ); 2505 bool hasCustomHeader = ( 2506 hc == HeaderUsecase.HeaderLine_CustomHeader || 2507 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2508 ); 2509 2510 if (hasCustomHeader) assert(hasOutputHeader); 2511 2512 auto op = new OperatorClass(); 2513 2514 if (hasCustomHeader) 2515 { 2516 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2517 op.setCustomHeader(customOutputFieldHeader); 2518 } 2519 2520 Operator[] operatorArray; 2521 operatorArray ~= op; 2522 2523 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2524 summarizer.setOperators(inputRangeObject(operatorArray)); 2525 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2526 2527 if (hasOutputHeader) 2528 { 2529 /* Write the header line. Note that this is a one-field header, */ 2530 auto headerLineOutput = appender!(char[])(); 2531 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2532 2533 /* Test that the header was generated correctly. 2534 * 2535 * Note: Because the output is generated by a Summarizer, it will have a 2536 * trailing newline. Use chomp to trim it. 2537 */ 2538 final switch (hc) 2539 { 2540 case HeaderUsecase.HeaderLine_DefaultHeader: 2541 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2542 assert(headerLineOutput.data.chomp == defaultHeader, 2543 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2544 defaultHeader)); 2545 break; 2546 case HeaderUsecase.HeaderLine_CustomHeader: 2547 case HeaderUsecase.NoHeaderLine_CustomHeader: 2548 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2549 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2550 customOutputFieldHeader)); 2551 break; 2552 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2553 break; 2554 } 2555 2556 } 2557 2558 /* For each line, process the line, generate the output, and test that the 2559 * value is correct. Start with the empty file case. 2560 */ 2561 foreach (i, const char[] expected; expectedValues) 2562 { 2563 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2564 auto summaryLineOutput = appender!(char[])(); 2565 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2566 assert(summaryLineOutput.data.chomp == expected, 2567 valueAssertMessage(operatorArray[0], hc, i, 2568 summaryLineOutput.data.chomp, expectedValues[i])); 2569 } 2570 } 2571 } 2572 } 2573 2574 /* Specific operators. 2575 * 2576 * Notes: 2577 * - The 'Calculator' inner classes are 'static'. This means inner class instances do not 2578 * keep a reference to the context of the outer class. In exchange, Calculator instances 2579 * need to hold all needed state, typically the field index they are summarizing. 2580 */ 2581 2582 /** CountOperator counts the number of occurrences of each unique key, or the number of 2583 * input lines if there is no unique key. 2584 * 2585 * CountOperator differs from most other operators in that it doesn't summarize a specific 2586 * field on the line. Instead it is summarizing a property of the unique key itself. For 2587 * this reason it doesn't derive from SingleFieldOperator. 2588 */ 2589 final class CountOperator : ZeroFieldOperator 2590 { 2591 this() 2592 { 2593 super("count"); 2594 } 2595 2596 final override ZeroFieldCalculator makeCalculator() 2597 { 2598 return new CountCalculator(); 2599 } 2600 2601 static final class CountCalculator : ZeroFieldCalculator 2602 { 2603 private size_t _count = 0; 2604 2605 final override void processNextEntry() 2606 { 2607 _count++; 2608 } 2609 2610 final override string calculate(const ref SummarizerPrintOptions printOptions) 2611 { 2612 return printOptions.formatNumber(_count); 2613 } 2614 } 2615 } 2616 2617 unittest // CountOperator 2618 { 2619 auto col1File = [["10"], ["9.5"], ["11"]]; 2620 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2621 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2622 2623 testZeroFieldOperator!CountOperator(col1File, "count", ["0", "1", "2", "3"]); 2624 testZeroFieldOperator!CountOperator(col2File, "count", ["0", "1", "2", "3"]); 2625 testZeroFieldOperator!CountOperator(col3File, "count", ["0", "1", "2", "3"]); 2626 } 2627 2628 /** RetainOperator retains the first occurrence of a field, without changing the header. 2629 * 2630 * RetainOperator is intended for fields where the value is expected to be the same for 2631 * all occurrences of the unique key, and the goal is to pass the value through unchanged. 2632 * It is like FirstOperator, except that the original header is preserved. The original 2633 * header preservation is setup in the call to the SingleFieldOperation constructor. 2634 * 2635 * Notes: 2636 * - An option to signal an error if multiple values are encountered might be useful. 2637 */ 2638 final class RetainOperator : SingleFieldOperator 2639 { 2640 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2641 { 2642 super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader); 2643 } 2644 2645 final override SingleFieldCalculator makeCalculator() 2646 { 2647 return new RetainCalculator(fieldIndex); 2648 } 2649 2650 final class RetainCalculator : SingleFieldCalculator 2651 { 2652 private bool _done = false; 2653 private string _value = ""; 2654 2655 this(size_t fieldIndex) 2656 { 2657 super(fieldIndex); 2658 } 2659 2660 final override RetainOperator getOperator() 2661 { 2662 return this.outer; 2663 } 2664 2665 final override void processNextField(const char[] nextField) 2666 { 2667 if (!_done) 2668 { 2669 _value = nextField.to!string; 2670 _done = true; 2671 } 2672 } 2673 2674 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2675 { 2676 return _value; 2677 } 2678 } 2679 } 2680 2681 unittest // RetainOperator 2682 { 2683 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2684 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2685 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2686 2687 testSingleFieldOperator!RetainOperator(col1File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2688 testSingleFieldOperator!RetainOperator(col2File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2689 testSingleFieldOperator!RetainOperator(col2File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2690 testSingleFieldOperator!RetainOperator(col3File, 0, "", ["", "r1c1", "r1c1", "r1c1"]); 2691 testSingleFieldOperator!RetainOperator(col3File, 1, "", ["", "r1c2", "r1c2", "r1c2"]); 2692 testSingleFieldOperator!RetainOperator(col3File, 2, "", ["", "r1c3", "r1c3", "r1c3"]); 2693 2694 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2695 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "", "r2c1", "r2c1"], 2696 new MissingFieldPolicy(true, "")); // Exclude missing 2697 testSingleFieldOperator!RetainOperator(col1misFile, 0, "", ["", "NA", "NA", "NA"], 2698 new MissingFieldPolicy(false, "NA")); // Replace missing 2699 } 2700 2701 /** FirstOperator outputs the first value found for the field. 2702 */ 2703 final class FirstOperator : SingleFieldOperator 2704 { 2705 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2706 { 2707 super("first", fieldIndex, missingPolicy); 2708 } 2709 2710 final override SingleFieldCalculator makeCalculator() 2711 { 2712 return new FirstCalculator(fieldIndex); 2713 } 2714 2715 final class FirstCalculator : SingleFieldCalculator 2716 { 2717 private bool _done = false; 2718 private string _value = ""; 2719 2720 this(size_t fieldIndex) 2721 { 2722 super(fieldIndex); 2723 } 2724 2725 final override FirstOperator getOperator() 2726 { 2727 return this.outer; 2728 } 2729 2730 final override void processNextField(const char[] nextField) 2731 { 2732 if (!_done) 2733 { 2734 _value = nextField.to!string; 2735 _done = true; 2736 } 2737 } 2738 2739 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2740 { 2741 return _value; 2742 } 2743 } 2744 } 2745 2746 unittest // FirstOperator 2747 { 2748 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2749 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2750 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2751 2752 testSingleFieldOperator!FirstOperator(col1File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2753 testSingleFieldOperator!FirstOperator(col2File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2754 testSingleFieldOperator!FirstOperator(col2File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2755 testSingleFieldOperator!FirstOperator(col3File, 0, "first", ["", "r1c1", "r1c1", "r1c1"]); 2756 testSingleFieldOperator!FirstOperator(col3File, 1, "first", ["", "r1c2", "r1c2", "r1c2"]); 2757 testSingleFieldOperator!FirstOperator(col3File, 2, "first", ["", "r1c3", "r1c3", "r1c3"]); 2758 2759 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2760 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "", "r2c1", "r2c1"], 2761 new MissingFieldPolicy(true, "")); // Exclude missing 2762 testSingleFieldOperator!FirstOperator(col1misFile, 0, "first", ["", "NA", "NA", "NA"], 2763 new MissingFieldPolicy(false, "NA")); // Replace missing 2764 } 2765 2766 /** LastOperator outputs the last value found for the field. 2767 */ 2768 final class LastOperator : SingleFieldOperator 2769 { 2770 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2771 { 2772 super("last", fieldIndex, missingPolicy); 2773 } 2774 2775 final override SingleFieldCalculator makeCalculator() 2776 { 2777 return new LastCalculator(fieldIndex); 2778 } 2779 2780 final class LastCalculator : SingleFieldCalculator 2781 { 2782 private string _value = ""; 2783 2784 this(size_t fieldIndex) 2785 { 2786 super(fieldIndex); 2787 } 2788 2789 final override LastOperator getOperator() 2790 { 2791 return this.outer; 2792 } 2793 2794 final override void processNextField(const char[] nextField) 2795 { 2796 _value = nextField.to!string; 2797 } 2798 2799 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2800 { 2801 return _value; 2802 } 2803 } 2804 } 2805 2806 unittest // LastOperator 2807 { 2808 auto col1File = [["r1c1"], ["r2c1"], ["r3c1"]]; 2809 auto col2File = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]]; 2810 auto col3File = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]]; 2811 2812 testSingleFieldOperator!LastOperator(col1File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2813 testSingleFieldOperator!LastOperator(col2File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2814 testSingleFieldOperator!LastOperator(col2File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2815 testSingleFieldOperator!LastOperator(col3File, 0, "last", ["", "r1c1", "r2c1", "r3c1"]); 2816 testSingleFieldOperator!LastOperator(col3File, 1, "last", ["", "r1c2", "r2c2", "r3c2"]); 2817 testSingleFieldOperator!LastOperator(col3File, 2, "last", ["", "r1c3", "r2c3", "r3c3"]); 2818 2819 auto col1misFile = [[""], ["r2c1"], ["r3c1"]]; 2820 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "", "r2c1", "r3c1"], 2821 new MissingFieldPolicy(true, "")); // Exclude missing 2822 testSingleFieldOperator!LastOperator(col1misFile, 0, "last", ["", "NA", "r2c1", "r3c1"], 2823 new MissingFieldPolicy(false, "NA")); // Replace missing 2824 } 2825 2826 /** MinOperator output the minimum value for the field. This is a numeric operator. 2827 * 2828 * This operator returns the original string without additional numeric formatting. 2829 * This can be useful when joining back to the original data. This is different than 2830 * numeric operators that perform calculations. 2831 */ 2832 final class MinOperator : SingleFieldOperator 2833 { 2834 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2835 { 2836 super("min", fieldIndex, missingPolicy); 2837 } 2838 2839 final override SingleFieldCalculator makeCalculator() 2840 { 2841 return new MinCalculator(fieldIndex); 2842 } 2843 2844 final class MinCalculator : SingleFieldCalculator 2845 { 2846 private bool _isFirst = true; 2847 private double _value = double.nan; 2848 private string _originalString = "nan"; // Note: Cannot format floats at compile time (version 2.087) 2849 2850 this(size_t fieldIndex) 2851 { 2852 super(fieldIndex); 2853 } 2854 2855 final override MinOperator getOperator() 2856 { 2857 return this.outer; 2858 } 2859 2860 final override void processNextField(const char[] nextField) 2861 { 2862 double fieldValue = nextField.to!double; 2863 if (_isFirst) 2864 { 2865 _value = fieldValue; 2866 _originalString = nextField.to!string; 2867 _isFirst = false; 2868 } 2869 else if (fieldValue < _value) 2870 { 2871 _value = fieldValue; 2872 _originalString = nextField.to!string; 2873 } 2874 } 2875 2876 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2877 { 2878 return _originalString; 2879 } 2880 } 2881 } 2882 2883 unittest // MinOperator 2884 { 2885 auto col1File = [["10"], ["9.5"], ["11"]]; 2886 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2887 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2888 2889 testSingleFieldOperator!MinOperator(col1File, 0, "min", ["nan", "10", "9.5", "9.5"]); 2890 testSingleFieldOperator!MinOperator(col2File, 0, "min", ["nan", "20", "20", "20"]); 2891 testSingleFieldOperator!MinOperator(col2File, 1, "min", ["nan", "-30", "-30", "-31"]); 2892 testSingleFieldOperator!MinOperator(col3File, 0, "min", ["nan", "9009", "199", "199"]); 2893 testSingleFieldOperator!MinOperator(col3File, 1, "min", ["nan", "9", "0", "0"]); 2894 testSingleFieldOperator!MinOperator(col3File, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]); 2895 2896 auto col1misFile = [[""], ["10"], ["-10"]]; 2897 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "nan", "10", "-10"], 2898 new MissingFieldPolicy(true, "")); // Exclude missing 2899 testSingleFieldOperator!MinOperator(col1misFile, 0, "min", ["nan", "5", "5", "-10"], 2900 new MissingFieldPolicy(false, "5")); // Replace missing 2901 } 2902 2903 /** MaxOperator output the maximum value for the field. This is a numeric operator. 2904 * 2905 * This operator returns the original string without additional numeric formatting. 2906 * This can be useful when joining back to the original data. This is different than 2907 * numeric operators that perform calculations. 2908 */ 2909 final class MaxOperator : SingleFieldOperator 2910 { 2911 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2912 { 2913 super("max", fieldIndex, missingPolicy); 2914 } 2915 2916 final override SingleFieldCalculator makeCalculator() 2917 { 2918 return new MaxCalculator(fieldIndex); 2919 } 2920 2921 final class MaxCalculator : SingleFieldCalculator 2922 { 2923 private bool _isFirst = true; 2924 private double _value = double.nan; 2925 private string _originalString = "nan"; // Note: Cannot format floats at compile time (version 2.087) 2926 2927 this(size_t fieldIndex) 2928 { 2929 super(fieldIndex); 2930 } 2931 2932 final override MaxOperator getOperator() 2933 { 2934 return this.outer; 2935 } 2936 2937 final override void processNextField(const char[] nextField) 2938 { 2939 double fieldValue = nextField.to!double; 2940 if (_isFirst) 2941 { 2942 _value = fieldValue; 2943 _originalString = nextField.to!string; 2944 _isFirst = false; 2945 } 2946 else if (fieldValue > _value) 2947 { 2948 _value = fieldValue; 2949 _originalString = nextField.to!string; 2950 } 2951 } 2952 2953 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 2954 { 2955 return _originalString; 2956 } 2957 } 2958 } 2959 2960 unittest // MaxOperator 2961 { 2962 auto col1File = [["10"], ["9.5"], ["11"]]; 2963 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 2964 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 2965 2966 testSingleFieldOperator!MaxOperator(col1File, 0, "max", ["nan", "10", "10", "11"]); 2967 testSingleFieldOperator!MaxOperator(col2File, 0, "max", ["nan", "20", "21", "22"]); 2968 testSingleFieldOperator!MaxOperator(col2File, 1, "max", ["nan", "-30", "-29", "-29"]); 2969 testSingleFieldOperator!MaxOperator(col3File, 0, "max", ["nan", "9009", "9009", "9009"]); 2970 testSingleFieldOperator!MaxOperator(col3File, 1, "max", ["nan", "9", "9", "9"]); 2971 testSingleFieldOperator!MaxOperator(col3File, 2, "max", ["nan", "-4.5", "-0.5", "12"]); 2972 2973 auto col1misFile = [[""], ["-10"], ["10"]]; 2974 testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "nan", "-10", "10"], 2975 new MissingFieldPolicy(true, "")); // Exclude missing 2976 testSingleFieldOperator!MaxOperator(col1misFile, 0, "max", ["nan", "5", "5", "10"], 2977 new MissingFieldPolicy(false, "5")); // Replace missing 2978 } 2979 2980 /** RangeOperator outputs the difference between the minimum and maximum values. 2981 * 2982 * If there is a single value, or all values are the same, the range is zero. This is 2983 * a numeric operator. 2984 */ 2985 final class RangeOperator : SingleFieldOperator 2986 { 2987 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 2988 { 2989 super("range", fieldIndex, missingPolicy); 2990 } 2991 2992 final override SingleFieldCalculator makeCalculator() 2993 { 2994 return new RangeCalculator(fieldIndex); 2995 } 2996 2997 final class RangeCalculator : SingleFieldCalculator 2998 { 2999 private bool _isFirst = true; 3000 private double _minValue = 0.0; 3001 private double _maxValue = 0.0; 3002 3003 this(size_t fieldIndex) 3004 { 3005 super(fieldIndex); 3006 } 3007 3008 final override RangeOperator getOperator() 3009 { 3010 return this.outer; 3011 } 3012 3013 final override void processNextField(const char[] nextField) 3014 { 3015 double fieldValue = nextField.to!double; 3016 if (_isFirst) 3017 { 3018 _minValue = _maxValue = fieldValue; 3019 _isFirst = false; 3020 } 3021 else if (fieldValue > _maxValue) 3022 { 3023 _maxValue = fieldValue; 3024 } 3025 else if (fieldValue < _minValue) 3026 { 3027 _minValue = fieldValue; 3028 } 3029 } 3030 3031 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3032 { 3033 return printOptions.formatNumber(_maxValue - _minValue); 3034 } 3035 } 3036 } 3037 3038 unittest // RangeOperator 3039 { 3040 auto col1File = [["10"], ["9.5"], ["11"]]; 3041 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3042 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 3043 3044 testSingleFieldOperator!RangeOperator(col1File, 0, "range", ["0", "0", "0.5", "1.5"]); 3045 testSingleFieldOperator!RangeOperator(col2File, 0, "range", ["0", "0", "1", "2"]); 3046 testSingleFieldOperator!RangeOperator(col2File, 1, "range", ["0", "0", "1", "2"]); 3047 testSingleFieldOperator!RangeOperator(col3File, 0, "range", ["0", "0", "8810", "8810"]); 3048 testSingleFieldOperator!RangeOperator(col3File, 1, "range", ["0", "0", "9", "9"]); 3049 testSingleFieldOperator!RangeOperator(col3File, 2, "range", ["0", "0", "4", "16.5"]); 3050 3051 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 3052 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"], 3053 new MissingFieldPolicy(true, "")); // Exclude missing 3054 testSingleFieldOperator!RangeOperator(col1misFile, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"], 3055 new MissingFieldPolicy(false, "5.5")); // Replace missing 3056 } 3057 3058 /** SumOperator produces the sum of all the values. This is a numeric operator. 3059 */ 3060 final class SumOperator : SingleFieldOperator 3061 { 3062 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3063 { 3064 super("sum", fieldIndex, missingPolicy); 3065 } 3066 3067 final override SingleFieldCalculator makeCalculator() 3068 { 3069 return new SumCalculator(fieldIndex); 3070 } 3071 3072 final class SumCalculator : SingleFieldCalculator 3073 { 3074 private double _total = 0.0; 3075 3076 this(size_t fieldIndex) 3077 { 3078 super(fieldIndex); 3079 } 3080 3081 final override SumOperator getOperator() 3082 { 3083 return this.outer; 3084 } 3085 3086 final override void processNextField(const char[] nextField) 3087 { 3088 _total += nextField.to!double; 3089 } 3090 3091 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3092 { 3093 return printOptions.formatNumber(_total); 3094 } 3095 } 3096 } 3097 3098 unittest // SumOperator 3099 { 3100 auto col1File = [["10"], ["9.5"], ["11"]]; 3101 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3102 auto col3File = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]]; 3103 3104 testSingleFieldOperator!SumOperator(col1File, 0, "sum", ["0", "10", "19.5", "30.5"]); 3105 testSingleFieldOperator!SumOperator(col2File, 0, "sum", ["0", "20", "41", "63"]); 3106 testSingleFieldOperator!SumOperator(col2File, 1, "sum", ["0", "-30", "-59", "-90"]); 3107 testSingleFieldOperator!SumOperator(col3File, 0, "sum", ["0", "9009", "9208", "12211"]); 3108 testSingleFieldOperator!SumOperator(col3File, 1, "sum", ["0", "9", "9", "9.2"]); 3109 testSingleFieldOperator!SumOperator(col3File, 2, "sum", ["0", "-4.5", "-5", "7"]); 3110 3111 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["11"]]; 3112 testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"], 3113 new MissingFieldPolicy(true, "")); // Exclude missing 3114 testSingleFieldOperator!SumOperator(col1misFile, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"], 3115 new MissingFieldPolicy(false, "1.5")); // Replace missing 3116 } 3117 3118 /** MeanOperator produces the mean (average) of all the values. This is a numeric operator. 3119 */ 3120 final class MeanOperator : SingleFieldOperator 3121 { 3122 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3123 { 3124 super("mean", fieldIndex, missingPolicy); 3125 } 3126 3127 final override SingleFieldCalculator makeCalculator() 3128 { 3129 return new MeanCalculator(fieldIndex); 3130 } 3131 3132 final class MeanCalculator : SingleFieldCalculator 3133 { 3134 private double _total = 0.0; 3135 private size_t _count = 0; 3136 3137 this(size_t fieldIndex) 3138 { 3139 super(fieldIndex); 3140 } 3141 3142 final override MeanOperator getOperator() 3143 { 3144 return this.outer; 3145 } 3146 3147 final override void processNextField(const char[] nextField) 3148 { 3149 _total += nextField.to!double; 3150 _count++; 3151 } 3152 3153 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3154 { 3155 return printOptions.formatNumber( 3156 (_count > 0) ? (_total / _count.to!double) : double.nan); 3157 } 3158 } 3159 } 3160 3161 unittest // MeanOperator 3162 { 3163 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3164 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3165 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3166 3167 testSingleFieldOperator!MeanOperator(col1File, 0, "mean", ["nan", "10", "9.75", "9"]); 3168 testSingleFieldOperator!MeanOperator(col2File, 0, "mean", ["nan", "20", "20.5", "21"]); 3169 testSingleFieldOperator!MeanOperator(col2File, 1, "mean", ["nan", "-30", "-29.5", "-30"]); 3170 testSingleFieldOperator!MeanOperator(col3File, 0, "mean", ["nan", "9009", "4509", "4509"]); 3171 testSingleFieldOperator!MeanOperator(col3File, 1, "mean", ["nan", "9", "4.5", "2"]); 3172 testSingleFieldOperator!MeanOperator(col3File, 2, "mean", ["nan", "-4.5", "-3", "2"]); 3173 3174 auto col1misFile = [[""], ["6"], [""], ["14"], ["40"]]; 3175 testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "nan", "6", "6", "10", "20"], 3176 new MissingFieldPolicy(true, "")); // Exclude missing 3177 testSingleFieldOperator!MeanOperator(col1misFile, 0, "mean", ["nan", "0", "3", "2", "5", "12"], 3178 new MissingFieldPolicy(false, "0")); // Replace missing 3179 } 3180 3181 /** MedianOperator produces the median of all the values. This is a numeric operator. 3182 * 3183 * All the field values are stored in memory as part of this calculation. This is 3184 * handled by unique key value lists. 3185 */ 3186 final class MedianOperator : SingleFieldOperator 3187 { 3188 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3189 { 3190 super("median", fieldIndex, missingPolicy); 3191 setSaveFieldValuesNumeric(); 3192 } 3193 3194 final override SingleFieldCalculator makeCalculator() 3195 { 3196 return new MedianCalculator(fieldIndex); 3197 } 3198 3199 final class MedianCalculator : SingleFieldCalculator 3200 { 3201 this(size_t fieldIndex) 3202 { 3203 super(fieldIndex); 3204 } 3205 3206 final override MedianOperator getOperator() 3207 { 3208 return this.outer; 3209 } 3210 3211 /* Work is done by saving the field values. */ 3212 final override void processNextField(const char[] nextField) 3213 { } 3214 3215 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3216 { 3217 return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex)); 3218 } 3219 } 3220 } 3221 3222 unittest // MedianOperator 3223 { 3224 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3225 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3226 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3227 3228 testSingleFieldOperator!MedianOperator(col1File, 0, "median", ["nan", "10", "9.75", "9.5"]); 3229 testSingleFieldOperator!MedianOperator(col2File, 0, "median", ["nan", "20", "20.5", "21"]); 3230 testSingleFieldOperator!MedianOperator(col2File, 1, "median", ["nan", "-30", "-29.5", "-30"]); 3231 testSingleFieldOperator!MedianOperator(col3File, 0, "median", ["nan", "9009", "4509", "4509"]); 3232 testSingleFieldOperator!MedianOperator(col3File, 1, "median", ["nan", "9", "4.5", "0"]); 3233 testSingleFieldOperator!MedianOperator(col3File, 2, "median", ["nan", "-4.5", "-3", "-1.5"]); 3234 3235 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 3236 testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"], 3237 new MissingFieldPolicy(true, "")); // Exclude missing 3238 testSingleFieldOperator!MedianOperator(col1misFile, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"], 3239 new MissingFieldPolicy(false, "0")); // Replace missing 3240 } 3241 3242 /** QuantileOperator produces the value representing the data at a cummulative probability. 3243 * This is a numeric operation. 3244 * 3245 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities 3246 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the 3247 * median). Data is sorted is ascending order. This operator takes one percentile, but it 3248 * is common to generate multiple quantile ranks for the same field when summarizing. 3249 * 3250 * All the field's values are stored in memory as part of this calculation. This is 3251 * handled by unique key value lists. 3252 */ 3253 final class QuantileOperator : SingleFieldOperator 3254 { 3255 private double _prob; 3256 3257 this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability) 3258 { 3259 assert(0.0 <= probability && probability <= 1.0); 3260 import std.format : format; 3261 3262 string header = (probability == 0.0) ? "pct0" : format("pct%02g", probability * 100.0); 3263 super(header, fieldIndex, missingPolicy); 3264 _prob = probability; 3265 setSaveFieldValuesNumeric(); 3266 } 3267 3268 final override SingleFieldCalculator makeCalculator() 3269 { 3270 return new QuantileCalculator(fieldIndex); 3271 } 3272 3273 final class QuantileCalculator : SingleFieldCalculator 3274 { 3275 this(size_t fieldIndex) 3276 { 3277 super(fieldIndex); 3278 } 3279 3280 final override QuantileOperator getOperator() 3281 { 3282 return this.outer; 3283 } 3284 3285 /* Work is done by saving the field values. */ 3286 final override void processNextField(const char[] nextField) 3287 { } 3288 3289 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3290 { 3291 import tsv_utils.common.numerics : quantile; 3292 return printOptions.formatNumber( 3293 quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex))); 3294 } 3295 } 3296 } 3297 3298 unittest // QuantileOperator 3299 { 3300 auto col1File = [["10"], ["9.5"], ["7.5"]]; 3301 auto col2File = [["20", "-30"], ["21", "-29"], ["22", "-31"]]; 3302 auto col3File = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]]; 3303 3304 auto defaultMissing = new MissingFieldPolicy; 3305 3306 /* Same as the median tests. */ 3307 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50); 3308 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50); 3309 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50); 3310 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50); 3311 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50); 3312 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50); 3313 3314 /* The extremes (0, 1), are min and max. */ 3315 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0); 3316 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0); 3317 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0); 3318 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0); 3319 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0); 3320 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0); 3321 3322 testSingleFieldOperatorBase!QuantileOperator(col1File, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0); 3323 testSingleFieldOperatorBase!QuantileOperator(col2File, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0); 3324 testSingleFieldOperatorBase!QuantileOperator(col2File, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0); 3325 testSingleFieldOperatorBase!QuantileOperator(col3File, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0); 3326 testSingleFieldOperatorBase!QuantileOperator(col3File, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0); 3327 testSingleFieldOperatorBase!QuantileOperator(col3File, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0); 3328 3329 /* For missing policies, re-use the median tests. */ 3330 auto col1misFile = [[""], ["10"], [""], ["9.5"], ["7.5"]]; 3331 testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"], 3332 new MissingFieldPolicy(true, ""), 0.5); // Exclude missing 3333 testSingleFieldOperatorBase!QuantileOperator(col1misFile, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"], 3334 new MissingFieldPolicy(false, "0"), 0.5); // Replace missing 3335 } 3336 3337 /** MadOperator produces the median absolute deviation from the median. This is a numeric 3338 * operation. 3339 * 3340 * The result is the raw MAD value, without a normalization applied. 3341 * 3342 * All the field values are stored in memory as part of this calculation. This is 3343 * handled by unique key value lists. 3344 */ 3345 final class MadOperator : SingleFieldOperator 3346 { 3347 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3348 { 3349 super("mad", fieldIndex, missingPolicy); 3350 setSaveFieldValuesNumeric(); 3351 } 3352 3353 final override SingleFieldCalculator makeCalculator() 3354 { 3355 return new MadCalculator(fieldIndex); 3356 } 3357 3358 final class MadCalculator : SingleFieldCalculator 3359 { 3360 this(size_t fieldIndex) 3361 { 3362 super(fieldIndex); 3363 } 3364 3365 final override MadOperator getOperator() 3366 { 3367 return this.outer; 3368 } 3369 3370 /* Work is done by saving the field values. */ 3371 final override void processNextField(const char[] nextField) 3372 { } 3373 3374 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3375 { 3376 import std.math : abs; 3377 import tsv_utils.common.numerics : rangeMedian; 3378 3379 auto median = valuesLists.numericValuesMedian(fieldIndex); 3380 auto values = valuesLists.numericValues(fieldIndex); 3381 auto medianDevs = new double[values.length]; 3382 foreach (size_t i, double v; values) 3383 medianDevs[i] = abs(v - median); 3384 3385 return printOptions.formatNumber(medianDevs.rangeMedian); 3386 } 3387 } 3388 } 3389 3390 unittest // MadOperator 3391 { 3392 auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]]; 3393 auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]]; 3394 auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]]; 3395 3396 testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]); 3397 testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]); 3398 testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]); 3399 testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]); 3400 testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]); 3401 testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]); 3402 3403 auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]]; 3404 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], 3405 new MissingFieldPolicy(true, "")); // Exclude missing 3406 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"], 3407 new MissingFieldPolicy(false, "0")); // Replace missing 3408 } 3409 3410 /** Generates the variance of the fields values. This is a numeric operator. 3411 */ 3412 final class VarianceOperator : SingleFieldOperator 3413 { 3414 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3415 { 3416 super("var", fieldIndex, missingPolicy); 3417 } 3418 3419 final override SingleFieldCalculator makeCalculator() 3420 { 3421 return new VarianceCalculator(fieldIndex); 3422 } 3423 3424 final class VarianceCalculator : SingleFieldCalculator 3425 { 3426 private double _count = 0.0; 3427 private double _mean = 0.0; 3428 private double _m2 = 0.0; // Sum of squares of differences from current mean 3429 3430 this(size_t fieldIndex) 3431 { 3432 super(fieldIndex); 3433 } 3434 3435 final override VarianceOperator getOperator() 3436 { 3437 return this.outer; 3438 } 3439 3440 final override void processNextField(const char[] nextField) 3441 { 3442 _count += 1.0; 3443 double fieldValue = nextField.to!double; 3444 double delta = fieldValue - _mean; 3445 _mean += delta / _count; 3446 _m2 += delta * (fieldValue - _mean); 3447 } 3448 3449 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3450 { 3451 return printOptions.formatNumber( 3452 (_count >= 2.0) ? (_m2 / (_count - 1.0)) : double.nan); 3453 } 3454 } 3455 } 3456 3457 unittest // VarianceOperator 3458 { 3459 auto col1File = [["5"], ["10"], ["15"]]; 3460 auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]]; 3461 auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]]; 3462 3463 testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]); 3464 testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]); 3465 testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]); 3466 testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]); 3467 testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]); 3468 testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]); 3469 3470 auto col1misFile = [["5"], ["10"], [""]]; 3471 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"], 3472 new MissingFieldPolicy(true, "")); // Exclude missing 3473 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"], 3474 new MissingFieldPolicy(false, "15")); // Replace missing 3475 } 3476 3477 /** Generates the standard deviation of the fields values. This is a numeric operator. 3478 */ 3479 final class StDevOperator : SingleFieldOperator 3480 { 3481 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3482 { 3483 super("stdev", fieldIndex, missingPolicy); 3484 } 3485 3486 final override SingleFieldCalculator makeCalculator() 3487 { 3488 return new StDevCalculator(fieldIndex); 3489 } 3490 3491 final class StDevCalculator : SingleFieldCalculator 3492 { 3493 private double _count = 0.0; 3494 private double _mean = 0.0; 3495 private double _m2 = 0.0; // Sum of squares of differences from current mean 3496 3497 this(size_t fieldIndex) 3498 { 3499 super(fieldIndex); 3500 } 3501 3502 final override StDevOperator getOperator() 3503 { 3504 return this.outer; 3505 } 3506 3507 final override void processNextField(const char[] nextField) 3508 { 3509 _count += 1.0; 3510 double fieldValue = nextField.to!double; 3511 double delta = fieldValue - _mean; 3512 _mean += delta / _count; 3513 _m2 += delta * (fieldValue - _mean); 3514 } 3515 3516 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3517 { 3518 import std.math : sqrt; 3519 return printOptions.formatNumber( 3520 (_count >= 2.0) ? (_m2 / (_count - 1.0)).sqrt : double.nan); 3521 } 3522 } 3523 } 3524 3525 /* StDevOperator unit tests - These would be improved with a tolerance option. 3526 */ 3527 unittest 3528 { 3529 auto col1File = [["1"], ["4"], ["7"]]; 3530 auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]]; 3531 auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]]; 3532 3533 testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]); 3534 testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]); 3535 testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]); 3536 testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]); 3537 testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]); 3538 testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]); 3539 3540 auto col1misFile = [["1"], ["4"], [""]]; 3541 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], 3542 new MissingFieldPolicy(true, "")); // Exclude missing 3543 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], 3544 new MissingFieldPolicy(false, "7")); // Replace missing 3545 } 3546 3547 /** UniqueCountOperator generates the number of unique values. Unique values are 3548 * based on exact text match calculation, not a numeric comparison. 3549 * 3550 * All the unique field values are stored in memory as part of this calculation. 3551 */ 3552 final class UniqueCountOperator : SingleFieldOperator 3553 { 3554 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3555 { 3556 super("unique_count", fieldIndex, missingPolicy); 3557 } 3558 3559 final override SingleFieldCalculator makeCalculator() 3560 { 3561 return new UniqueCountCalculator(fieldIndex); 3562 } 3563 3564 final class UniqueCountCalculator : SingleFieldCalculator 3565 { 3566 private bool[string] _values; 3567 3568 this(size_t fieldIndex) 3569 { 3570 super(fieldIndex); 3571 } 3572 3573 final override UniqueCountOperator getOperator() 3574 { 3575 return this.outer; 3576 } 3577 3578 final override void processNextField(const char[] nextField) 3579 { 3580 if (nextField !in _values) _values[nextField.to!string] = true; 3581 } 3582 3583 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3584 { 3585 return printOptions.formatNumber(_values.length); 3586 } 3587 } 3588 } 3589 3590 unittest // UniqueCount 3591 { 3592 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 3593 auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 3594 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3595 3596 testSingleFieldOperator!UniqueCountOperator(col1File, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]); 3597 testSingleFieldOperator!UniqueCountOperator(col2File, 0, "unique_count", ["0", "1", "2", "2"]); 3598 testSingleFieldOperator!UniqueCountOperator(col2File, 1, "unique_count", ["0", "1", "1", "2"]); 3599 testSingleFieldOperator!UniqueCountOperator(col3File, 0, "unique_count", ["0", "1", "2", "3"]); 3600 testSingleFieldOperator!UniqueCountOperator(col3File, 1, "unique_count", ["0", "1", "2", "2"]); 3601 testSingleFieldOperator!UniqueCountOperator(col3File, 2, "unique_count", ["0", "1", "2", "3"]); 3602 3603 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]]; 3604 testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"], 3605 new MissingFieldPolicy(true, "")); // Exclude missing 3606 3607 3608 testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"], 3609 new MissingFieldPolicy(false, "XYZ")); // Replace missing 3610 } 3611 3612 /** MissingCountOperator generates the number of missing values. This overrides 3613 * the global missingFieldsPolicy. 3614 */ 3615 final class MissingCountOperator : SingleFieldOperator 3616 { 3617 private MissingFieldPolicy _globalMissingPolicy; 3618 3619 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3620 { 3621 _globalMissingPolicy = missingPolicy; 3622 super("missing_count", fieldIndex, new MissingFieldPolicy(false, "")); 3623 } 3624 3625 final override SingleFieldCalculator makeCalculator() 3626 { 3627 return new MissingCountCalculator(fieldIndex); 3628 } 3629 3630 final class MissingCountCalculator : SingleFieldCalculator 3631 { 3632 private size_t _missingCount = 0; 3633 3634 this(size_t fieldIndex) 3635 { 3636 super(fieldIndex); 3637 } 3638 3639 final override MissingCountOperator getOperator() 3640 { 3641 return this.outer; 3642 } 3643 3644 final override void processNextField(const char[] nextField) 3645 { 3646 if (this.outer._globalMissingPolicy.isMissingField(nextField)) _missingCount++; 3647 } 3648 3649 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3650 { 3651 return printOptions.formatNumber(_missingCount); 3652 } 3653 } 3654 } 3655 3656 unittest // MissingCount 3657 { 3658 auto col1File = [["a"], ["b"], [""], [" "], [""]]; 3659 auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 3660 auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; 3661 3662 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]); 3663 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]); 3664 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]); 3665 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]); 3666 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]); 3667 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]); 3668 3669 auto excludeMissing = new MissingFieldPolicy(true, ""); 3670 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3671 3672 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing); 3673 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing); 3674 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing); 3675 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing); 3676 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing); 3677 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing); 3678 3679 testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing); 3680 testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing); 3681 testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing); 3682 testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing); 3683 testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing); 3684 testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], replaceMissing); 3685 } 3686 3687 /** NotMissingCountOperator generates the number of not-missing values. This overrides 3688 * the global missingFieldsPolicy. 3689 */ 3690 final class NotMissingCountOperator : SingleFieldOperator 3691 { 3692 private MissingFieldPolicy _globalMissingPolicy; 3693 3694 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3695 { 3696 _globalMissingPolicy = missingPolicy; 3697 super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, "")); 3698 } 3699 3700 final override SingleFieldCalculator makeCalculator() 3701 { 3702 return new NotMissingCountCalculator(fieldIndex); 3703 } 3704 3705 final class NotMissingCountCalculator : SingleFieldCalculator 3706 { 3707 private size_t _notMissingCount = 0; 3708 3709 this(size_t fieldIndex) 3710 { 3711 super(fieldIndex); 3712 } 3713 3714 final override NotMissingCountOperator getOperator() 3715 { 3716 return this.outer; 3717 } 3718 3719 final override void processNextField(const char[] nextField) 3720 { 3721 if (!this.outer._globalMissingPolicy.isMissingField(nextField)) _notMissingCount++; 3722 } 3723 3724 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3725 { 3726 return printOptions.formatNumber(_notMissingCount); 3727 } 3728 } 3729 } 3730 3731 unittest // NotMissingCount 3732 { 3733 auto col1File = [["a"], ["b"], [""], [" "], [""]]; 3734 auto col2File = [["abc", ""], ["", ""], ["def", ""]]; 3735 auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]]; 3736 3737 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]); 3738 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"]); 3739 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"]); 3740 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]); 3741 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]); 3742 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]); 3743 3744 auto excludeMissing = new MissingFieldPolicy(true, ""); 3745 auto replaceMissing = new MissingFieldPolicy(false, "X"); 3746 3747 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing); 3748 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing); 3749 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing); 3750 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing); 3751 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing); 3752 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing); 3753 3754 testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing); 3755 testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing); 3756 testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing); 3757 testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing); 3758 testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing); 3759 testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing); 3760 } 3761 3762 /** ModeOperator outputs the most frequent value seen. In the event of a tie, the 3763 * first value seen is produced. 3764 * 3765 * All the field values are stored in memory as part of this calculation. 3766 * 3767 */ 3768 final class ModeOperator : SingleFieldOperator 3769 { 3770 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3771 { 3772 super("mode", fieldIndex, missingPolicy); 3773 } 3774 3775 final override SingleFieldCalculator makeCalculator() 3776 { 3777 return new ModeCalculator(fieldIndex); 3778 } 3779 3780 final class ModeCalculator : SingleFieldCalculator 3781 { 3782 private size_t[string] _valueCounts; 3783 private Appender!(string[]) _uniqueValues; 3784 3785 this(size_t fieldIndex) 3786 { 3787 super(fieldIndex); 3788 } 3789 3790 final override ModeOperator getOperator() 3791 { 3792 return this.outer; 3793 } 3794 3795 final override void processNextField(const char[] nextField) 3796 { 3797 auto countPtr = (nextField in _valueCounts); 3798 3799 if (countPtr is null) 3800 { 3801 string value = nextField.to!string; 3802 _uniqueValues.put(value); 3803 _valueCounts[value] = 1; 3804 } 3805 else 3806 { 3807 (*countPtr)++; 3808 } 3809 } 3810 3811 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3812 { 3813 string modeValue = ""; 3814 size_t modeCount = 0; 3815 3816 foreach (value; _uniqueValues.data) 3817 { 3818 assert(value in _valueCounts); 3819 3820 auto count = _valueCounts[value]; 3821 3822 if (count > modeCount) 3823 { 3824 modeValue = value; 3825 modeCount = count; 3826 } 3827 } 3828 3829 return modeValue; 3830 } 3831 } 3832 } 3833 3834 unittest // ModeOperator 3835 { 3836 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3837 auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]]; 3838 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3839 3840 testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]); 3841 testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]); 3842 testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]); 3843 testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]); 3844 testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]); 3845 testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]); 3846 3847 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3848 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"], 3849 new MissingFieldPolicy(true, "")); // Exclude missing 3850 3851 3852 testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"], 3853 new MissingFieldPolicy(false, "X")); // Replace missing 3854 } 3855 3856 /** ModeCountOperator outputs the count of the most frequent value seen. 3857 * 3858 * All the field values are stored in memory as part of this calculation. 3859 * 3860 */ 3861 final class ModeCountOperator : SingleFieldOperator 3862 { 3863 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3864 { 3865 super("mode_count", fieldIndex, missingPolicy); 3866 } 3867 3868 final override SingleFieldCalculator makeCalculator() 3869 { 3870 return new ModeCountCalculator(fieldIndex); 3871 } 3872 3873 final class ModeCountCalculator : SingleFieldCalculator 3874 { 3875 private size_t[string] _valueCounts; 3876 3877 this(size_t fieldIndex) 3878 { 3879 super(fieldIndex); 3880 } 3881 3882 final override ModeCountOperator getOperator() 3883 { 3884 return this.outer; 3885 } 3886 3887 final override void processNextField(const char[] nextField) 3888 { 3889 auto countPtr = (nextField in _valueCounts); 3890 3891 if (countPtr is null) 3892 { 3893 string value = nextField.to!string; 3894 _valueCounts[value] = 1; 3895 } 3896 else 3897 { 3898 (*countPtr)++; 3899 } 3900 } 3901 3902 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3903 { 3904 size_t modeCount = 0; 3905 foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 3906 return printOptions.formatNumber(modeCount); 3907 } 3908 } 3909 } 3910 3911 unittest // ModeCountOperator 3912 { 3913 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 3914 auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 3915 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 3916 3917 testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 3918 testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 3919 testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 3920 testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 3921 testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 3922 testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); 3923 3924 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 3925 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], 3926 new MissingFieldPolicy(true, "")); // Exclude missing 3927 3928 3929 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], 3930 new MissingFieldPolicy(false, "X")); // Replace missing 3931 } 3932 3933 /** ValuesOperator outputs each value delimited by an alternate delimiter character. 3934 * 3935 * All the field values are stored in memory as part of this calculation. This is 3936 * handled by unique key value lists. 3937 */ 3938 3939 final class ValuesOperator : SingleFieldOperator 3940 { 3941 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3942 { 3943 super("values", fieldIndex, missingPolicy); 3944 setSaveFieldValuesText(); 3945 } 3946 3947 final override SingleFieldCalculator makeCalculator() 3948 { 3949 return new ValuesCalculator(fieldIndex); 3950 } 3951 3952 final class ValuesCalculator : SingleFieldCalculator 3953 { 3954 this(size_t fieldIndex) 3955 { 3956 super(fieldIndex); 3957 } 3958 3959 final override ValuesOperator getOperator() 3960 { 3961 return this.outer; 3962 } 3963 3964 /* Work is done by saving the field values. */ 3965 final override void processNextField(const char[] nextField) 3966 { } 3967 3968 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3969 { 3970 return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); 3971 } 3972 } 3973 } 3974 3975 unittest // ValuesOperator 3976 { 3977 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 3978 auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 3979 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; 3980 3981 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 3982 testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 3983 testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 3984 testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 3985 testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 3986 testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); 3987 3988 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], 3989 new MissingFieldPolicy(true, "")); // Exclude missing 3990 3991 3992 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], 3993 new MissingFieldPolicy(false, "X")); // Replace missing 3994 } 3995 3996 /** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter 3997 * character. Values are output in the order seen. 3998 * 3999 * All unique field values are stored in memory as part of this calculation. 4000 * 4001 */ 4002 final class UniqueValuesOperator : SingleFieldOperator 4003 { 4004 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 4005 { 4006 super("unique_values", fieldIndex, missingPolicy); 4007 } 4008 4009 final override SingleFieldCalculator makeCalculator() 4010 { 4011 return new UniqueValuesCalculator(fieldIndex); 4012 } 4013 4014 final class UniqueValuesCalculator : SingleFieldCalculator 4015 { 4016 private size_t[string] _valuesHash; 4017 private Appender!(string[]) _uniqueValues; 4018 4019 this(size_t fieldIndex) 4020 { 4021 super(fieldIndex); 4022 } 4023 4024 final override UniqueValuesOperator getOperator() 4025 { 4026 return this.outer; 4027 } 4028 4029 final override void processNextField(const char[] nextField) 4030 { 4031 auto ptr = (nextField in _valuesHash); 4032 4033 if (ptr is null) 4034 { 4035 string value = nextField.to!string; 4036 _uniqueValues.put(value); 4037 _valuesHash[value] = 1; 4038 } 4039 } 4040 4041 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 4042 { 4043 return _uniqueValues.data.join(printOptions.valuesDelimiter); 4044 } 4045 } 4046 } 4047 4048 unittest // UniqueValuesOperator 4049 { 4050 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 4051 auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 4052 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; 4053 4054 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 4055 testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 4056 testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 4057 testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 4058 testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]); 4059 testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); 4060 4061 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], 4062 new MissingFieldPolicy(true, "")); // Exclude missing 4063 4064 4065 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], 4066 new MissingFieldPolicy(false, "X")); // Replace missing 4067 }