/**
Command line tool that reads TSV files and summarizes field values associated with
equivalent keys.

Copyright (c) 2016-2021, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_summarize;

import std.algorithm : all, any, canFind, each, find, findSplit, map, joiner, splitter;
import std.array : join;
import std.conv : to;
import std.exception : enforce;
import std.format : format;
import std.range;
import std.stdio;
import std.typecons : tuple;
import std.container : DList;

/* Runtime GC option string, only declared for compiler front-ends 2.085 and later.
 * NOTE(review): "cleanup:none" presumably skips the GC's cleanup pass at program
 * termination - confirm against the druntime GC configuration documentation.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line via TsvSummarizeOptions.processArgs, then runs
     * tsvSummarize.
     *
     * Returns: The exit code supplied by processArgs when it reports that execution
     * should not continue (help, version, or argument error); 1 if tsvSummarize throws
     * an Exception (the message is written to stderr); 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);

        /* r[0] is the "continue" flag, r[1] the exit code to use when not continuing. */
        if (!r[0]) return r[1];

        /* LDC profiling builds: discard counters gathered during argument processing. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try tsvSummarize(cmdopt);
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/** Text printed ahead of the option list for '--help-verbose'. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  Make    Color   Time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make is generated by the command:

  $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv

This produces:

  Make   Time_min  Time_mean
  ford   122       127
  chevy  124       124
  bmw    118       122

Using '--group-by Make,Color' will group by both 'Make' and 'Color'.
Omitting the '--group-by' entirely summarizes fields for the full file.

The previous example uses field names to identify fields. Field numbers
can be used as well. The next two commands are equivalent:

  $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv
  $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11
  --median elapsed_time,system_time,user_time
  --median '*_time'              # Wildcard. All fields ending in '_time'.

The quantile operator requires one or more probabilities after the fields:

  --quantile run_time:0.25       # Quantile 1 of the 'run_time' field
  --quantile 2:0.25              # Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   # Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

/** Text printed ahead of the option list for the standard help request. */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Fields can be specified either by field number or field name. Use
'--help-verbose' for more detailed help.

Options:
EOS";

/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                /// Program name
    ByLineSourceRange!() inputSources; /// Input Files
    size_t[] keyFields;                /// -g, --group-by
    bool hasHeader = false;            /// -H, --header
    bool writeHeader = false;          /// -w, --write-header
    char inputFieldDelimiter = '\t';   /// --d|delimiter
    char valuesDelimiter = '|';        /// --v|values-delimiter
    size_t floatPrecision = 12;        /// --p|float-precision
    DList!Operator operators;          /// Operators, in the order specified.
    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.

    /* NOTE(review): as a field initializer this `new` is presumably evaluated once at
     * compile time, so every default-initialized TsvSummarizeOptions would share the
     * same policy object - confirm if more than one options struct is ever live.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value.

    /* tsv-summarize operators require access to the header line when the operator is
     * created. This is because named fields may be used to identify fields. To
     * enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
     * array during initial processing by std.getopt. The group-by operation is
     * similar, but is added to the cmdLineOtherFieldOptions instead. At least one
     * cmdLineOperatorOptions entry is required.
     *
     * The different handlers are defined after processArgs.
     */

    /* CmdOptionHandler delegate signature - This is the call made to process the command
     * line option arguments once the header fields are available (or immediately after
     * getopt processing when headers are not in use).
     */
    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

    private CmdOptionHandler[] cmdLineOperatorOptions;
    private CmdOptionHandler[] cmdLineOtherFieldOptions;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Exceptions thrown during processing are caught here, reported on stderr, and
     * turned into the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist : fieldListHelpText;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewline;

        bool helpVerbose = false;        // --help-verbose
        bool helpFields = false;         // --help-fields
        bool versionWanted = false;      // --V|version
        bool excludeMissing = false;     // --x|exclude-missing
        string missingValueReplacement;  // --r|replace-missing


        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "help-fields", " Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by", "<field-list> Fields to use as key.", &addGroupByOptionHandler,

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header", " Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count", " Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler,
                "count-header", "STR Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler,
                "retain", "<field-list> Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator,
                "first", "<field-list>[:STR] First value seen.", &addOperatorOptionHandler!FirstOperator,
                "last", "<field-list>[:STR] Last value seen.", &addOperatorOptionHandler!LastOperator,
                "min", "<field-list>[:STR] Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator,
                "max", "<field-list>[:STR] Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator,
                "range", "<field-list>[:STR] Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator,
                "sum", "<field-list>[:STR] Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator,
                "mean", "<field-list>[:STR] Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator,
                "median", "<field-list>[:STR] Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator,
                "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler,
                "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator,
                "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator,
                "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator,
                "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator,
                "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator,
                "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator,
                "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator,
                "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator,
                "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator,
                "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             */

            enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

            /* The message names the options exactly as they are registered above. */
            enforce(inputFieldDelimiter != valuesDelimiter,
                    "Cannot use the same character for both --d|delimiter and --v|values-delimiter.");

            enforce(!(excludeMissing && missingValueReplacement.length != 0),
                    "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");

            /* Missing field policy. */
            globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                /* Run all the operator handlers. */
                cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
                cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));

                /* keyFields need to be part of the endFieldIndex, which is one past
                 * the last field index. */
                keyFields.each!(delegate (size_t x)
                                {
                                    if (x >= endFieldIndex) endFieldIndex = x + 1;
                                } );
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);


            if (hasHeader)
            {
                /* An empty first file leaves headerFields empty. */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewline(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
                }

                fieldListArgProcessing();
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* getopt callback for '--g|group-by'. Defers the work by queuing a delegate that
     * runs groupByOptionHandler once header information is available.
     */
    private void addGroupByOptionHandler(string option, string optionVal)
    {
        cmdLineOtherFieldOptions ~=
            (bool hasHeader, string[] headerFields)
            => groupByOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Parses the group-by field list into zero-based indices stored in keyFields.
     * Parse errors are rethrown with the option name and value prefixed to the message.
     */
    private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import tsv_utils.common.fieldlist;

        try
        {
            keyFields =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields)
                .array;
        }
        catch (Exception e)
        {
            e.msg = format("[--%s %s]. %s", option, optionVal, e.msg);
            throw e;
        }
    }

    /* getopt callback for the single-field operators. Queues a delegate that runs
     * operatorOptionHandler for OperatorClass once header information is available.
     */
    private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     *
     * One operator is appended to `operators` per field, and endFieldIndex is raised
     * to cover each field used.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)
    (bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;

            /* Anything left after the field list is the ':<header>' suffix. */
            bool hasOptionalHeader = optionVal.length > optionValParse.consumed;
            string optionalHeader;

            if (hasOptionalHeader)
            {
                enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields.");
                enforce(optionVal.length - optionValParse.consumed > 1,
                        format("No value after field list.\n Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                               option, option));

                /* Skip the separator character following the field list. */
                optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup;
            }

            foreach (fieldIndex; fieldIndices)
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (hasOptionalHeader)
                {
                    enforce(op.allowCustomHeader, "Operator does not support custom headers.");
                    op.setCustomHeader(optionalHeader);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg);
            throw exc;
        }
    }

    /* getopt callback for '--quantile'. Queues a delegate that runs
     * quantileOperatorOptionHandler once header information is available.
     */
    private void addQuantileOperatorOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     *
     * The option value is '<field-list>:<prob>[,<prob>...][:<header>]'. A QuantileOperator
     * is appended to `operators` for every (field, probability) pair.
     */
    private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;
            enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered.");

            /* Text after the field list: probabilities, optionally followed by ':header'. */
            auto splitRemaining =
                optionVal[optionValParse.consumed + 1 .. $]
                .findSplit(":");

            /* A ':' separator was found but nothing follows it. */
            enforce(splitRemaining[1].empty || !splitRemaining[2].empty,
                    "Empty custom header.");

            auto probStr = splitRemaining[0];
            auto header = splitRemaining[2];

            double[] probs;

            foreach (str; probStr.splitter(','))
            {
                double p = str.to!double;
                enforce(p >= 0.0 && p <= 1.0,
                        format("Probability '%g' is not in the interval [0.0,1.0].", p));
                probs ~= p;
            }

            enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
                    "Cannot specify a custom header when using multiple fields or multiple probabilities.");

            assert (fieldIndices.length > 0);
            assert (probs.length > 0);
            assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

            foreach (fieldIndex; fieldIndices)
            {
                foreach (p; probs)
                {
                    auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                    if (!header.empty) op.setCustomHeader(header);
                    operators.insertBack(op);
                }
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception e)
        {
            e.msg = format(
                "[--%s %s]. %s\n Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, e.msg, option, option);
            throw e;
        }

    }

    /* getopt callback for '--count'. Queues a delegate that runs countOptionHandler. */
    private void addCountOptionHandler()
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countOptionHandler(hasHeader, headerFields);
    }

    /* Appends a CountOperator. The header arguments are not used. */
    private void countOptionHandler(bool hasHeader, string[] headerFields)
    {
        operators.insertBack(new CountOperator());
    }

    /* getopt callback for '--count-header'. Queues a delegate that runs
     * countHeaderOptionHandler.
     */
    private void addCountHeaderOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Appends a CountOperator that uses optionVal as its output header. */
    private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }
}

/** tsvSummarize does the primary work of the tsv-summarize program.
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
        bufferedByLine, throwIfWindowsNewline;

    /* Check that the input files were setup as expected. Should at least have one
     * input, stdin if nothing else, and newlines removed from the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* BufferedOutputRange is faster than writing directly to stdout if many lines are
     * being written. This will happen mostly when group-by is used.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

    /* Pick the Summarizer based on the number of key-fields entered. */
    auto summarizer =
        (cmdopt.keyFields.length == 0)
        ?
new NoKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : (cmdopt.keyFields.length == 1)
        ? new OneKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy)

        : new MultiKeySummarizer!(typeof(bufferedOutput))(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);

    /* Add the operators to the Summarizer. */
    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    /* If there's no input header line, but writing an output header anyway, then
     * write it now. This helps tasks further on in a unix pipeline detect errors
     * quickly, without waiting for all the data to flow through the pipeline.
     */
    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

    if (!cmdopt.hasHeader && cmdopt.writeHeader)
    {
        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
        bufferedOutput.flush;
    }

    /* Process each input file, one line at a time. The lineFields array is allocated
     * once, sized to the number of leading fields actually needed, and refilled for
     * every line.
     */
    auto lineFields = new char[][](cmdopt.endFieldIndex);
    bool headerFound = false;   // True once the first file's header has been processed.
    foreach (inputStream; cmdopt.inputSources)
    {
        /* Line numbers are 1-based and restart for each input file. */
        foreach (lineNum, line; inputStream.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewline(line, inputStream.name, lineNum);

            /* Copy the needed number of fields to the fields array.
             * Note: The number is zero if no operator needs fields. Notably, the count
             * operator. Used by itself, it counts the number input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t fieldIndex = 0;
                foreach (fieldValue; line.splitter(cmdopt.inputFieldDelimiter))
                {
                    /* Stop splitting once every needed field has been captured. */
                    if (fieldIndex == cmdopt.endFieldIndex) break;
                    lineFields[fieldIndex] = fieldValue;
                    fieldIndex++;
                }

                if (fieldIndex == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. This counts the
                     * unique values in field 1. If there's only one column, then an empty
                     * line becomes an empty string for field 1. Work-around: Point to the
                     * line. It's an empty string.
                     */
                    lineFields[fieldIndex] = line;
                    fieldIndex++;
                }

                /* This check runs for header lines as well as data lines. */
                enforce(fieldIndex >= cmdopt.endFieldIndex,
                        format("Not enough fields in line. File: %s, Line: %s",
                               inputStream.name, lineNum));
            }

            if (cmdopt.hasHeader && lineNum == 1)
            {
                /* Only the first file's header is processed. The first line of each
                 * later file is skipped without being passed to the summarizer.
                 */
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;

                    /* Write the header now. This helps tasks further on in a unix
                     * pipeline detect errors quickly, without waiting for all the
                     * data to flow through the pipeline. Note that an upstream task
                     * may have flushed its header line, so the header may arrive
                     * long before the main block of data.
                     */
                    summarizer.writeSummaryHeader(bufferedOutput, printOptions);
                    bufferedOutput.flush;
                }
            }
            else
            {
                /* Process the line. Processing will fail (throw) if a field cannot be
                 * converted to the expected type. The failure is rethrown as a new
                 * Exception carrying the file name and line number; on line 1 a hint
                 * about '--header' is appended.
                 */
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    throw new Exception(
                        format("Could not process line or field: %s\n  File: %s Line: %s%s",
                               exc.msg, inputStream.name, lineNum,
                               (lineNum == 1) ? "\n  Is this a header line? Use --header to skip." : ""));
                }
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* Whew! We're done processing input data. Run the calculations and print. */

    summarizer.writeSummaryBody(bufferedOutput, printOptions);
}

/** The default field header. This is used when the input doesn't have field headers,
 *  but field headers are used in the output. The default is "fieldN", where N is the
 *  1-upped field number.
 *
 *  Params:
 *    fieldIndex = Zero-based field index.
 *
 *  Returns: The string "field" followed by fieldIndex + 1.
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    enum prefix = "field";
    return prefix ~ (fieldIndex + 1).to!string;
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}

/** Produce a summary header from a field header.
 *
 * The result has the form `<fieldHeader>_<operation>`. e.g. If the field header is
 * "length" and the operation is "max", the summary header is "length_max". The field
 * header typically comes from a header line in the input data or was constructed by
 * fieldHeaderFromIndex().
 *
 * If operationName is the empty string, then fieldHeader is used unchanged. This supports
 * the Retain operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    return (operationName.length > 0) ?
fieldHeader ~ "_" ~ operationName : fieldHeader;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}

/** Printing options shared by Summarizers and Calculators.
 *
 * The values normally originate from command line options; they are bundled in their
 * own struct so that printing code does not depend on the options container.
 */
struct SummarizerPrintOptions
{
    import std.traits : isFloatingPoint, isIntegral;

    char fieldDelimiter;
    char valuesDelimiter;
    size_t floatPrecision = 12;

    /** Formats an integral or floating point value, passing floatPrecision on to the
     * formatNumber routine in tsv_utils.common.numerics.
     */
    auto formatNumber(T)(T n) const
    if (isIntegral!T || isFloatingPoint!T)
    {
        /* Renamed on import so the call below cannot be mistaken for this method. */
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;
        return numericsFormatNumber!T(n, floatPrecision);
    }
}

/** A Summarizer holds the state of a summarization run and does the basic processing.
 * Reading files and splitting input lines is the caller's job.
 *
 * Implementations provide:
 *  - setOperators - Receives the operators to process; called after construction.
 *  - processHeaderLine - Handles a file's header line. Returns true if this was the
 *      first header line processed (relevant when reading multiple files).
 *  - processNextLine - Handles a non-header line.
 *  - writeSummaryHeader - Writes the output header line.
 *  - writeSummaryBody - Writes the result lines.
 */
interface Summarizer(OutputRange)
{
    /** Receives the operators to process. Called after the object is initialized. */
    void setOperators(InputRange!Operator op);

    /** Handles the header line of a file. Returns true if it was the first header
     * line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Handles a non-header line. */
    void processNextLine(const char[][] lineFields);

    /** Writes the output header line. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);

    /** Writes the result lines. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}

/** SummarizerBase implements the parts common to all summarizers, which is nearly
 * everything apart from the handling of unique keys.
 *
 * It stores the Operators, sets up the SharedFieldValues when any operator asks for
 * saved field values, and processes the first header line. Derived classes supply the
 * per-line processing and the output methods.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Null if no shared field value lists.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _inputFieldDelimiter = inputFieldDelimiter;
        _missingPolicy = missingPolicy;
    }

    /** The field delimiter supplied at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Stores the Operators used by the Summarizer. Called after construction.
     *
     * Each operator is appended to the operator list and counted. The field indices an
     * operator wants saved (numeric and text) are registered with the shared field
     * values object, which is created the first time an operator requests any.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (op; operators)
        {
            _operators.insertBack(op);
            ++_numOperators;

            auto numericIndices = op.numericFieldsToSave();
            auto textIndices = op.textFieldsToSave();

            /* Nothing to register for operators that save no field values. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (index; numericIndices) _sharedFieldValues.addNumericIndex(index);
            foreach (index; textIndices) _sharedFieldValues.addTextIndex(index);
        }
    }

    /** Handles the header line of a file. The first call forwards the header fields to
     * every operator and returns true; later calls do nothing and return false.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a new set of value lists from the shared field values, or returns null
     * when no operator requested saved field values.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions);
}

/** The NoKeySummarizer is used when summarizing values across the entire input.
 *
 * Note: NoKeySummarizer is used in Operator unit tests and gets extensive testing
 * through that mechanism.
*/
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;
    private UniqueKeyValuesLists _valueLists;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed. */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* There is no key, so a single Calculator per Operator is sufficient. They
         * are created here, right after the operators are registered.
         *
         * NOTE(review): `operators` is iterated a second time after the base class
         * already iterated it; this presumably relies on the range object not being
         * exhausted by the first pass - confirm against the callers.
         */
        foreach (operator; operators)
        {
            _calculators ~= operator.makeCalculator;
        }

        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        foreach (calculator; _calculators) calculator.processNextLine(lineFields);

        /* The values lists are null when no operator saves field values. */
        if (_valueLists is null) return;
        _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: one header per operator, delimiter separated. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto headerLine = _operators[].map!(op => op.header).join(printOptions.fieldDelimiter);
        put(outputStream, headerLine);
        put(outputStream, '\n');
    }

    /** Called to write the result lines. Without keys there is exactly one result line. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        auto resultLine =
            _calculators[]
            .map!(calculator => calculator.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter);
        put(outputStream, resultLine);
        put(outputStream, '\n');
    }
}

/** KeySummarizerBase does work shared by the single key and multi-key summarizers.
 *
 * The primary difference between those two is the formation of the key. The primary
 * reason for separating those into two separate classes is to simplify (speed-up)
 * handling of single field keys, which are the most common use case.
*/
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /** Per-key state: one Calculator per operator plus the (possibly null) saved
     * field values for the key.
     */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;              // Keys in first-seen order, used for output.
    private UniqueKeyData[string] _uniqueKeyData;  // Lookup from key to its per-key state.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Processes one data line for the given key. A new entry is created the first
     * time a key is seen; the key is only converted to a string in that case.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        auto dataPtr = (key in _uniqueKeyData);
        auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr;

        data.calculators.each!(x => x.processNextLine(lineFields));
        if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Registers a key not seen before: records it in the output order list and
     * creates a fresh Calculator for every operator. Returns the new per-key state.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto calculators = new Calculator[_numOperators];
        size_t i = 0;
        foreach (op; _operators)
        {
            calculators[i] = op.makeCalculator;
            i++;
        }

        return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists());
    }

    /** Writes the header line: the key header followed by one header per operator. */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order the keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach(key; _uniqueKeys)
        {
            auto data = _uniqueKeyData[key];
            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream,
                data.calculators[]
                .map!(x => x.calculate(data.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter));
            put(outputStream, '\n');
        }
    }

    /** Header text written for the key column(s). */
    abstract string keyFieldHeader() const @property;
}

/** This Summarizer is for the case where the unique key is based on exactly one field.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    /* Note: the list of unique keys is owned by KeySummarizerBase. A second, unused
     * field of the same name was previously declared here; it only shadowed the base
     * class member and has been removed.
     */

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line the key header is replaced
     * by the header text of the key field. Returns true for the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: the key field is indexed below, so an index equal to
         * the number of fields would already be out of bounds.
         */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the key field's value as the unique key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}

/** This Summarizer is for the case where the unique key is based on multiple fields.
*/
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;

    /* Note: the list of unique keys is owned by KeySummarizerBase. A second, unused
     * field of the same name was previously declared here; it only shadowed the base
     * class member and has been removed.
     */

    /** The default key header is built from the per-index field headers, joined by
     * the input field delimiter. It is replaced if a header line is processed.
     */
    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line the key header becomes the
     * header text of the key fields, joined by the input field delimiter. Returns
     * true for the first header line.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line. The unique key is the key field values joined by the
     * input field delimiter, in the order the key fields were specified.
     */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        processNextLineWithKey(key, lineFields);
    }
}

version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are a command line args, an input file, and expected output. The
     * input file and expected output are already split into lines and fields, the helper
     * manages re-assembly.
The program name from the command line args is printed if
     * an error occurs; it is useful to identify the test that failed.
     *
     * Note: Much of this is a duplication of tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     *
     * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
     * there needs to be a physical file when calling processArgs. It's hard to get
     * around, as the intent is to read the header line of the first input file during
     * command line argument processing. Eventually this unit test process will need to
     * be rewritten. For now, a file with the equivalent data is being added to the
     * command line.
     *
     * Update (Sept 2020): The physical file needs to be closed for unit tests on
     * Windows. This is so the temporary file can be deleted without trouble. Since it's
     * a placeholder in these tests, it's getting iterated but not popped off the
     * inputSources and closed. Normal collection is not closing it quickly enough. So
     * all inputSources are closed at the end of this function.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Prefixes a message with the helper name and the test name (cmdArgs[0]). */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testSummarizer] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* The args are rendered to text before processArgs is called with them. */
        auto savedCmdArgs = cmdArgs.to!string;
        TsvSummarizeOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));

        assert(file.all!(line => line.length >= cmdopt.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        alias OutputType = typeof(appender!(char[])());

        /* Pick the Summarizer based on the number of key-fields entered. */
        SummarizerBase!OutputType summarizer;
        if (cmdopt.keyFields.length == 0)
        {
            summarizer = new NoKeySummarizer!OutputType(
                cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else if (cmdopt.keyFields.length == 1)
        {
            summarizer = new OneKeySummarizer!OutputType(
                cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }
        else
        {
            summarizer = new MultiKeySummarizer!OutputType(
                cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
        }

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](cmdopt.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (i, val; line[0..cmdopt.endFieldIndex]) lineFields[i] = val.dup;

            immutable isHeaderLine = cmdopt.hasHeader && lineNum == 1;
            if (isHeaderLine)
            {
                if (!headerFound)
                {
                    summarizer.processHeaderLine(lineFields);
                    headerFound = true;
                }
                continue;
            }

            /* Data line. Exceptions are turned into assertion failures naming the test. */
            try summarizer.processNextLine(lineFields);
            catch (Exception exc)
            {
                assert(false, formatAssertMessage(exc.msg));
            }
        }

        auto printOptions = SummarizerPrintOptions(
            cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (cmdopt.hasHeader || cmdopt.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }
        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected lines, ensuring non-empty output ends in a newline. */
        auto expectedOutput =
            expected
            .map!(fields => fields.joiner(cmdopt.inputFieldDelimiter.to!string))
            .joiner("\n")
            .to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));

        /* Ensure all files are closed by emptying the stack. */
        while (!cmdopt.inputSources.empty) cmdopt.inputSources.popFront;
    }

    /* Writes fileData to filepath, one record per line, with the fields of each
     * record separated by delimiter. The file is opened in "wb" mode.
     */
    void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t")
    {
        import std.algorithm;
        import std.stdio;

        auto outputFile = File(filepath, "wb");
        foreach (fields; fileData)
        {
            outputFile.writeln(fields.joiner(delimiter));
        }
        outputFile.close;
    }
}

unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
1172 import std.file : mkdir, rmdirRecurse; 1173 import std.path : buildPath; 1174 1175 auto testDir = makeUnittestTempDir("tsv_summarizer"); 1176 scope(exit) testDir.rmdirRecurse; 1177 1178 /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited 1179 * extent, command line option handling (TsvSummarizeOptions). Individual operators 1180 * have separate tests, those tests test the no-key summarizer. The Values operator is 1181 * used in these tests. It engages a number of behaviors, and the results have limited 1182 * ambiguity. Using only one operator limits dependence on individual operators. 1183 * 1184 * Update (April 2020): There now needs to be a real file passed to testSummarizer. 1185 * See the comments with testSummarizer for details. 1186 */ 1187 1188 auto file1 = [["fld1", "fld2", "fld3"], 1189 ["a", "a", "3"], 1190 ["c", "a", "2b"], 1191 ["c", "bc", ""], 1192 ["a", "c", "2b"], 1193 ["", "bc", ""], 1194 ["c", "bc", "3"]]; 1195 1196 auto file1Path = buildPath(testDir, "file1.tsv"); 1197 auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv"); 1198 writeDataFile(file1Path, file1); 1199 writeDataFile(file1NoHeaderPath, file1[1 .. $]); 1200 1201 /* Single-key summarizer tests. 
1202 */ 1203 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path], 1204 file1, 1205 [["fld1", "fld1_values"], 1206 ["a", "a|a"], 1207 ["c", "c|c|c"], 1208 ["", ""]] 1209 ); 1210 testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path], 1211 file1, 1212 [["fld1", "fld1_values"], 1213 ["a", "a|a"], 1214 ["c", "c|c|c"], 1215 ["", ""]] 1216 ); 1217 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path], 1218 file1, 1219 [["fld1", "fld2_values"], 1220 ["a", "a|c"], 1221 ["c", "a|bc|bc"], 1222 ["", "bc"]] 1223 ); 1224 testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path], 1225 file1, 1226 [["fld1", "fld2_values"], 1227 ["a", "a|c"], 1228 ["c", "a|bc|bc"], 1229 ["", "bc"]] 1230 ); 1231 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path], 1232 file1, 1233 [["fld1", "fld3_values"], 1234 ["a", "3|2b"], 1235 ["c", "2b||3"], 1236 ["", ""]] 1237 ); 1238 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path], 1239 file1, 1240 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1241 ["a", "a|a", "a|c", "3|2b"], 1242 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1243 ["", "", "bc", ""]] 1244 ); 1245 testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path], 1246 file1, 1247 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1248 ["a", "a|a", "a|c", "3|2b"], 1249 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1250 ["", "", "bc", ""]] 1251 ); 1252 testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path], 1253 file1, 1254 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1255 ["a", "a|a", "a|c", "3|2b"], 1256 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1257 ["", "", "bc", ""]] 1258 ); 1259 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path], 1260 
file1, 1261 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1262 ["a", "a|a", "a|c", "3|2b"], 1263 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1264 ["", "", "bc", ""]] 1265 ); 1266 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path], 1267 file1, 1268 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1269 ["a", "3|2b", "a|c", "a|a"], 1270 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1271 ["", "", "bc", ""]] 1272 ); 1273 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path], 1274 file1, 1275 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1276 ["a", "3|2b", "a|c", "a|a"], 1277 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1278 ["", "", "bc", ""]] 1279 ); 1280 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path], 1281 file1, 1282 [["fld2", "fld1_values"], 1283 ["a", "a|c"], 1284 ["bc", "c||c"], 1285 ["c", "a"]] 1286 ); 1287 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path], 1288 file1, 1289 [["fld2", "fld2_values"], 1290 ["a", "a|a"], 1291 ["bc", "bc|bc|bc"], 1292 ["c", "c"]] 1293 ); 1294 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path], 1295 file1, 1296 [["fld2", "fld3_values"], 1297 ["a", "3|2b"], 1298 ["bc", "||3"], 1299 ["c", "2b"]] 1300 ); 1301 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path], 1302 file1, 1303 [["fld2", "fld1_values", "fld3_values"], 1304 ["a", "a|c", "3|2b"], 1305 ["bc", "c||c", "||3"], 1306 ["c", "a", "2b"]] 1307 ); 1308 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path], 1309 file1, 1310 [["fld2", "fld3_values", "fld1_values"], 1311 ["a", "3|2b", "a|c"], 1312 ["bc", "||3", "c||c"], 1313 ["c", "2b", "a"]] 1314 ); 1315 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path], 1316 file1, 1317 [["fld3", "fld1_values"], 1318 ["3", "a|c"], 1319 ["2b", "c|a"], 1320 ["", 
"c|"]] 1321 ); 1322 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path], 1323 file1, 1324 [["fld3", "fld2_values"], 1325 ["3", "a|bc"], 1326 ["2b", "a|c"], 1327 ["", "bc|bc"]] 1328 ); 1329 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path], 1330 file1, 1331 [["fld3", "fld1_values", "fld2_values"], 1332 ["3", "a|c", "a|bc"], 1333 ["2b", "c|a", "a|c"], 1334 ["", "c|", "bc|bc"]] 1335 ); 1336 testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path], 1337 file1, 1338 [["fld3", "fld1_values", "fld2_values"], 1339 ["3", "a|c", "a|bc"], 1340 ["2b", "c|a", "a|c"], 1341 ["", "c|", "bc|bc"]] 1342 ); 1343 1344 /* Multi-key summarizer tests. 1345 */ 1346 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path], 1347 file1, 1348 [["fld1", "fld2", "fld1_values"], 1349 ["a", "a", "a"], 1350 ["c", "a", "c"], 1351 ["c", "bc", "c|c"], 1352 ["a", "c", "a"], 1353 ["", "bc", ""]] 1354 ); 1355 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path], 1356 file1, 1357 [["fld1", "fld2", "fld2_values"], 1358 ["a", "a", "a"], 1359 ["c", "a", "a"], 1360 ["c", "bc", "bc|bc"], 1361 ["a", "c", "c"], 1362 ["", "bc", "bc"]] 1363 ); 1364 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path], 1365 file1, 1366 [["fld1", "fld2", "fld3_values"], 1367 ["a", "a", "3"], 1368 ["c", "a", "2b"], 1369 ["c", "bc", "|3"], 1370 ["a", "c", "2b"], 1371 ["", "bc", ""]] 1372 ); 1373 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path], 1374 file1, 1375 [["fld1", "fld2", "fld3_values", "fld1_values"], 1376 ["a", "a", "3", "a"], 1377 ["c", "a", "2b", "c"], 1378 ["c", "bc", "|3", "c|c"], 1379 ["a", "c", "2b", "a"], 1380 ["", "bc", "", ""]] 1381 ); 1382 testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path], 1383 
file1, 1384 [["fld1", "fld2", "fld3_values", "fld1_values"], 1385 ["a", "a", "3", "a"], 1386 ["c", "a", "2b", "c"], 1387 ["c", "bc", "|3", "c|c"], 1388 ["a", "c", "2b", "a"], 1389 ["", "bc", "", ""]] 1390 ); 1391 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path], 1392 file1, 1393 [["fld3", "fld2", "fld1_values"], 1394 ["3", "a", "a"], 1395 ["2b", "a", "c"], 1396 ["", "bc", "c|"], 1397 ["2b", "c", "a"], 1398 ["3", "bc", "c"]] 1399 ); 1400 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path], 1401 file1, 1402 [["fld3", "fld2", "fld1_values"], 1403 ["3", "a", "a"], 1404 ["2b", "a", "c"], 1405 ["", "bc", "c|"], 1406 ["2b", "c", "a"], 1407 ["3", "bc", "c"]] 1408 ); 1409 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path], 1410 file1, 1411 [["fld2", "fld1", "fld3", "fld2_values"], 1412 ["a", "a", "3", "a"], 1413 ["a", "c", "2b", "a"], 1414 ["bc", "c", "", "bc"], 1415 ["c", "a", "2b", "c"], 1416 ["bc", "", "", "bc"], 1417 ["bc", "c", "3", "bc"]] 1418 ); 1419 1420 /* Missing policies. 
*/ 1421 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path], 1422 file1, 1423 [["fld1", "fld1_values"], 1424 ["a", "a|a"], 1425 ["c", "c|c|c"], 1426 ["", ""]] 1427 ); 1428 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path], 1429 file1, 1430 [["fld1", "fld2_values"], 1431 ["a", "a|c"], 1432 ["c", "a|bc|bc"], 1433 ["", "bc"]] 1434 ); 1435 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path], 1436 file1, 1437 [["fld1", "fld3_values"], 1438 ["a", "3|2b"], 1439 ["c", "2b|3"], 1440 ["", ""]] 1441 ); 1442 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path], 1443 file1, 1444 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1445 ["a", "a|a", "a|c", "3|2b"], 1446 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1447 ["", "", "bc", ""]] 1448 ); 1449 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path], 1450 file1, 1451 [["fld1", "fld1_values"], 1452 ["a", "a|a"], 1453 ["c", "c|c|c"], 1454 ["", "NA"]] 1455 ); 1456 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path], 1457 file1, 1458 [["fld1", "fld2_values"], 1459 ["a", "a|c"], 1460 ["c", "a|bc|bc"], 1461 ["", "bc"]] 1462 ); 1463 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path], 1464 file1, 1465 [["fld1", "fld3_values"], 1466 ["a", "3|2b"], 1467 ["c", "2b|NA|3"], 1468 ["", "NA"]] 1469 ); 1470 testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", "--values", "fld3", "-r", "NA", file1Path], 1471 file1, 1472 [["fld1", "fld3_values"], 1473 ["a", "3|2b"], 1474 ["c", "2b|NA|3"], 1475 ["", "NA"]] 1476 ); 1477 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path], 1478 file1, 1479 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1480 ["a", "a|a", 
"a|c", "3|2b"], 1481 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1482 ["", "NA", "bc", "NA"]] 1483 ); 1484 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path], 1485 file1, 1486 [["fld1", "fld2", "fld3_values", "fld1_values"], 1487 ["a", "a", "3", "a"], 1488 ["c", "a", "2b", "c"], 1489 ["c", "bc", "3", "c|c"], 1490 ["a", "c", "2b", "a"], 1491 ["", "bc", "", ""]] 1492 ); 1493 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path], 1494 file1, 1495 [["fld3", "fld2", "fld1_values"], 1496 ["3", "a", "a"], 1497 ["2b", "a", "c"], 1498 ["", "bc", "c"], 1499 ["2b", "c", "a"], 1500 ["3", "bc", "c"]] 1501 ); 1502 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path], 1503 file1, 1504 [["fld2", "fld1", "fld3", "fld2_values"], 1505 ["a", "a", "3", "a"], 1506 ["a", "c", "2b", "a"], 1507 ["bc", "c", "", "bc"], 1508 ["c", "a", "2b", "c"], 1509 ["bc", "", "", "bc"], 1510 ["bc", "c", "3", "bc"]] 1511 ); 1512 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path], 1513 file1, 1514 [["fld1", "fld2", "fld3_values", "fld1_values"], 1515 ["a", "a", "3", "a"], 1516 ["c", "a", "2b", "c"], 1517 ["c", "bc", "NA|3", "c|c"], 1518 ["a", "c", "2b", "a"], 1519 ["", "bc", "NA", "NA"]] 1520 ); 1521 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path], 1522 file1, 1523 [["fld3", "fld2", "fld1_values"], 1524 ["3", "a", "a"], 1525 ["2b", "a", "c"], 1526 ["", "bc", "c|NA"], 1527 ["2b", "c", "a"], 1528 ["3", "bc", "c"]] 1529 ); 1530 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path], 1531 file1, 1532 [["fld2", "fld1", "fld3", "fld2_values"], 1533 ["a", "a", "3", "a"], 1534 ["a", "c", "2b", "a"], 1535 ["bc", "c", "", "bc"], 1536 ["c", "a", "2b", "c"], 1537 ["bc", "", "", "bc"], 1538 ["bc", "c", "3", "bc"]] 1539 ); 1540 1541 /* 
Validate that the no-key summarizer works with testSummarizer helper function. 1542 */ 1543 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path], 1544 file1, 1545 [["fld1_values", "fld2_values"], 1546 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1547 ); 1548 testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path], 1549 file1, 1550 [["fld1_values", "fld2_values"], 1551 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1552 ); 1553 1554 /* Header variations: no header line; auto-generated header line; custom headers. 1555 */ 1556 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath], 1557 file1[1..$], 1558 [["a", "a|a"], 1559 ["c", "c|c|c"], 1560 ["", ""]] 1561 ); 1562 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath], 1563 file1[1..$], 1564 [["a", "a", "a"], 1565 ["c", "a", "a"], 1566 ["c", "bc", "bc|bc"], 1567 ["a", "c", "c"], 1568 ["", "bc", "bc"]] 1569 ); 1570 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath], 1571 file1[1..$], 1572 [["field2", "field1_values"], 1573 ["a", "a|c"], 1574 ["bc", "c||c"], 1575 ["c", "a"]] 1576 ); 1577 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath], 1578 file1[1..$], 1579 [["field3", "field2", "field1_values"], 1580 ["3", "a", "a"], 1581 ["2b", "a", "c"], 1582 ["", "bc", "c|"], 1583 ["2b", "c", "a"], 1584 ["3", "bc", "c"]] 1585 ); 1586 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path], 1587 file1, 1588 [["fld2", "Field3Values"], 1589 ["a", "3|2b"], 1590 ["bc", "||3"], 1591 ["c", "2b"]] 1592 ); 1593 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path], 1594 file1, 1595 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1596 ["a", "a", "3", "a"], 1597 ["c", "a", "2b", "c"], 1598 ["c", "bc", "|3", "c|c"], 
1599 ["a", "c", "2b", "a"], 1600 ["", "bc", "", ""]] 1601 ); 1602 testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path], 1603 file1, 1604 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1605 ["a", "a", "3", "a"], 1606 ["c", "a", "2b", "c"], 1607 ["c", "bc", "|3", "c|c"], 1608 ["a", "c", "2b", "a"], 1609 ["", "bc", "", ""]] 1610 ); 1611 testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path], 1612 file1, 1613 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1614 ["a", "a", "3", "a"], 1615 ["c", "a", "2b", "c"], 1616 ["c", "bc", "|3", "c|c"], 1617 ["a", "c", "2b", "a"], 1618 ["", "bc", "", ""]] 1619 ); 1620 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath], 1621 file1[1..$], 1622 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1623 ["a", "3|2b", "a|c", "a|a"], 1624 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1625 ["", "", "bc", ""]] 1626 ); 1627 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1628 file1[1..$], 1629 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1630 ["a", "3", "a", "3", "a", "a"], 1631 ["c", "2b", "a", "2b", "c", "a"], 1632 ["c", "", "bc", "", "c", "bc"], 1633 ["a", "2b", "c", "2b", "a", "c"], 1634 ["", "", "bc", "", "", "bc"], 1635 ["c", "3", "bc", "3", "c", "bc"]] 1636 ); 1637 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1638 file1[1..$], 1639 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1640 ["a", "3", "a", "3", "a", "a"], 
1641 ["c", "2b", "a", "2b", "c", "a"], 1642 ["c", "", "bc", "", "c", "bc"], 1643 ["a", "2b", "c", "2b", "a", "c"], 1644 ["", "", "bc", "", "", "bc"], 1645 ["c", "3", "bc", "3", "c", "bc"]] 1646 ); 1647 1648 /* Alternate file widths and lengths. 1649 */ 1650 1651 auto file3x2 = [["fld1", "fld2", "fld3"], 1652 ["a", "b", "c"], 1653 ["c", "b", "a"]]; 1654 1655 auto file3x2Path = buildPath(testDir, "file3x2.tsv"); 1656 auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv"); 1657 writeDataFile(file3x2Path, file3x2); 1658 writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]); 1659 1660 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path], 1661 file3x2, 1662 [["fld1", "fld3_values"], 1663 ["a", "c"], 1664 ["c", "a"]] 1665 ); 1666 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path], 1667 file3x2, 1668 [["fld2", "fld3_values"], 1669 ["b", "c|a"]] 1670 ); 1671 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path], 1672 file3x2, 1673 [["fld2", "fld1", "fld3_values"], 1674 ["b", "a", "c"], 1675 ["b", "c", "a"]] 1676 ); 1677 1678 auto file3x1 = [["fld1", "fld2", "fld3"], 1679 ["a", "b", "c"]]; 1680 1681 auto file3x1Path = buildPath(testDir, "file3x1.tsv"); 1682 auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv"); 1683 writeDataFile(file3x1Path, file3x1); 1684 writeDataFile(file3x1NoHeaderPath, file3x1[1 .. 
$]); 1685 1686 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path], 1687 file3x1, 1688 [["fld1", "fld3_values"], 1689 ["a", "c"]] 1690 ); 1691 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath], 1692 file3x1[1..$], 1693 [["a", "c"]] 1694 ); 1695 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path], 1696 file3x1, 1697 [["fld2", "fld1", "fld3_values"], 1698 ["b", "a", "c"]] 1699 ); 1700 testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path], 1701 file3x1, 1702 [["fld2", "fld1", "fld3_values"], 1703 ["b", "a", "c"]] 1704 ); 1705 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath], 1706 file3x1[1..$], 1707 [["b", "a", "c"]] 1708 ); 1709 1710 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1711 1712 auto file3x0Path = buildPath(testDir, "file3x0.tsv"); 1713 auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv"); 1714 writeDataFile(file3x0Path, file3x0); 1715 writeDataFile(file3x0NoHeaderPath, file3x0[1 .. 
$]); 1716 1717 1718 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path], 1719 file3x0, 1720 [["fld1", "fld3_values"]] 1721 ); 1722 testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path], 1723 file3x0, 1724 [["fld1", "fld3_values"]] 1725 ); 1726 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1727 file3x0[1..$], 1728 [] 1729 ); 1730 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1731 file3x0[1..$], 1732 [["field1", "field3_values"]] 1733 ); 1734 1735 1736 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path], 1737 file3x0, 1738 [["fld2", "fld1", "fld3_values"]] 1739 ); 1740 1741 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1742 file3x0[1..$], 1743 [] 1744 ); 1745 1746 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1747 file3x0[1..$], 1748 [["field2", "field1", "field3_values"]] 1749 ); 1750 1751 auto file2x1 = [["fld1", "fld2"], 1752 ["a", "b"]]; 1753 1754 auto file2x1Path = buildPath(testDir, "file2x1.tsv"); 1755 auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv"); 1756 writeDataFile(file2x1Path, file2x1); 1757 writeDataFile(file2x1NoHeaderPath, file2x1[1 .. 
$]); 1758 1759 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path], 1760 file2x1, 1761 [["fld1", "fld2_values"], 1762 ["a", "b"]] 1763 ); 1764 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path], 1765 file2x1, 1766 [["fld2", "fld1", "fld1_values"], 1767 ["b", "a", "a"]] 1768 ); 1769 1770 auto file2x0 = [["fld1", "fld2"]]; 1771 1772 auto file2x0Path = buildPath(testDir, "file2x0.tsv"); 1773 auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv"); 1774 writeDataFile(file2x0Path, file2x0); 1775 writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]); 1776 1777 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path], 1778 file2x0, 1779 [["fld1", "fld2_values"]] 1780 ); 1781 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path], 1782 file2x0, 1783 [["fld2", "fld1", "fld1_values"]] 1784 ); 1785 1786 auto file1x2 = [["fld1"], 1787 ["a"], 1788 [""]]; 1789 1790 auto file1x2Path = buildPath(testDir, "file1x2.tsv"); 1791 auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv"); 1792 writeDataFile(file1x2Path, file1x2); 1793 writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]); 1794 1795 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path], 1796 file1x2, 1797 [["fld1", "fld1_values"], 1798 ["a", "a"], 1799 ["", ""]] 1800 ); 1801 1802 auto file1x2b = [["fld1"], 1803 [""], 1804 [""]]; 1805 1806 auto file1x2bPath = buildPath(testDir, "file1x2b.tsv"); 1807 auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv"); 1808 writeDataFile(file1x2bPath, file1x2b); 1809 writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. 
$]); 1810 1811 testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath], 1812 file1x2b, 1813 [["fld1", "fld1_values"], 1814 ["", "|"]] 1815 ); 1816 1817 auto file1x1 = [["fld1"], 1818 ["x"]]; 1819 1820 auto file1x1Path = buildPath(testDir, "file1x1.tsv"); 1821 auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv"); 1822 writeDataFile(file1x1Path, file1x1); 1823 writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]); 1824 1825 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path], 1826 file1x1, 1827 [["fld1", "fld1_values"], 1828 ["x", "x"]] 1829 ); 1830 testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path], 1831 file1x1, 1832 [["fld1", "fld1_values"], 1833 ["x", "x"]] 1834 ); 1835 1836 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1837 file1x1[1..$], 1838 [["x", "x"]] 1839 ); 1840 1841 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1842 file1x1[1..$], 1843 [["field1", "field1_values"], 1844 ["x", "x"]] 1845 ); 1846 1847 auto file1x1b = [["fld1"], 1848 [""]]; 1849 1850 auto file1x1bPath = buildPath(testDir, "file1x1b.tsv"); 1851 auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv"); 1852 writeDataFile(file1x1bPath, file1x1b); 1853 writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]); 1854 1855 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath], 1856 file1x1b, 1857 [["fld1", "fld1_values"], 1858 ["", ""]] 1859 ); 1860 1861 auto file1x0 = [["fld1"]]; 1862 1863 auto file1x0Path = buildPath(testDir, "file1x0.tsv"); 1864 auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv"); 1865 writeDataFile(file1x0Path, file1x0); 1866 writeDataFile(file1x0NoHeaderPath, file1x0[1 .. 
$]); 1867 1868 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path], 1869 file1x0, 1870 [["fld1", "fld1_values"]] 1871 ); 1872 1873 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1874 file1x0[1..$], 1875 [] 1876 ); 1877 1878 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1879 file1x0[1..$], 1880 [["field1", "field1_values"]] 1881 ); 1882 1883 /* Alternate delimiters. 1884 * 1885 * Note: In current unit test setup the data is already in memory (file1). 1886 * 'file1Path' points to a file with equivalent data, but not read, except if 1887 * processing the header line. A data file is created for the '%' and '#' 1888 * delimiter cases (these read the header), but we don't bother for the others. 1889 */ 1890 auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv"); 1891 auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv"); 1892 writeDataFile(file1PctDelimPath, file1, "%"); 1893 writeDataFile(file1HashDelimPath, file1, "#"); 1894 1895 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath], 1896 file1, 1897 [["fld1_values", "fld2_values"], 1898 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1899 ); 1900 testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath], 1901 file1, 1902 [["fld1_values", "fld2_values"], 1903 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1904 ); 1905 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path], 1906 file1, 1907 [["fld1_values", "fld2_values"], 1908 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1909 ); 1910 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], 1911 file1, 1912 [["fld1_values", "fld2_values"], 1913 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1914 ); 1915 
testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], 1916 file1, 1917 [["fld1_values", "fld2_values"], 1918 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1919 ); 1920 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1921 "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath], 1922 file1[1..$], 1923 [["field2", "field1_values"], 1924 ["a", "a:c"], 1925 ["bc", "c::c"], 1926 ["c", "a"]] 1927 ); 1928 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1929 "--values-delimiter", "\\", file1NoHeaderPath], 1930 file1[1..$], 1931 [["a", "a", "a"], 1932 ["c", "a", "a"], 1933 ["c", "bc", "bc\\bc"], 1934 ["a", "c", "c"], 1935 ["", "bc", "bc"]] 1936 ); 1937 } 1938 1939 /* Summary Operators and Calculators 1940 * 1941 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1942 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1943 * Calculator is used to manage the summary calculation for each unique key in the input. 1944 * 1945 * As an example, consider the command: 1946 * 1947 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1948 * 1949 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1950 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1951 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1952 * calculator needs to track occurrence count and sum. Calculators produce the final 1953 * value when all processing is finished. 1954 * 1955 * Summary field headers 1956 * 1957 * There are several options for specifying summary field headers. The defaults combine the 1958 * operator name and the header of the field summarized. The defaults can be overridden on 1959 * on the command line. 
These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy, each operator contains one.
 * Calculators access their operator's policy object.
 */

/** The command-line face of a summary calculation, e.g. '--mean 5'.
 *
 * An Operator supplies the output header, lists the fields whose values must be saved
 * on its behalf, and creates Calculator objects.
 */
interface Operator
{
    @property string header();
    @property string name();
    void processHeaderLine(const char[][] fields);
    size_t[] numericFieldsToSave();     // Fields to be saved as numeric values for this Operator
    size_t[] textFieldsToSave();        // Fields to be saved as text values for this Operator
    Calculator makeCalculator();
}

/** A Calculator carries out one computation. It is handed every input line and
 *  produces the final value once all input has been processed.
 */
interface Calculator
{
    void processNextLine(const char[][] fields);
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/** Describes how missing (empty) field values are handled.
 *
 * Exactly one of three modes is in effect: missing values are used unchanged, they are
 * excluded, or they are replaced by a fixed string. A non-empty replacement string
 * selects replacement mode, regardless of the exclude flag.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;          // Missing values are passed through as-is.
    private bool _replaceMissing = false;     // Missing values are swapped for the replacement.
    private string _missingReplacement;       // The replacement text, used when replacing.

    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Recomputes all three settings from the exclude flag and the replacement string. */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* A field counts as missing when it is empty. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    @property final bool useMissing() const
    {
        return _useMissing;
    }

    /* Exclusion is the mode left over when values are neither used nor replaced. */
    @property final bool excludeMissing() const
    {
        return !(_useMissing || _replaceMissing);
    }

    @property final bool replaceMissing() const
    {
        return _replaceMissing;
    }

    @property final string missingReplacement() const
    {
        return _missingReplacement;
    }
}

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 *  - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 *  - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
 *    of the fields advertised by Operators as needing sharing.
This list gets created
 *    during command initialization (SummarizerBase.setOperators).
 *  - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *    time a new unique key is found, in parallel to the Calculator objects created for the
 *    key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
 *  - A unique key's UniqueKeyValuesLists object is passed each input line, same as
 *    Calculators, saving the values.
 *  - Calculators retrieve the saved values during the calculation phase. The calculator's
 *    processNextField method is typically a no-op.
 *  - Calculators cannot make assumptions about the order of the saved values. This is a
 *    pragmatic concession to median and quantile calculations, which need to sort the data,
 *    at least partially. Rather than generate sorted copies, the current algorithms
 *    sort the data in place.
 *
 * One concession to duplicate storage is that text and numeric versions of the same
 * field might be stored. The reason is that it's important to convert text to numbers
 * as they are read so that useful error messages can be generated. And, storing both
 * forms of the same field should be less common.
 *
 * The current implementation uses the same missing values policy for all fields. If
 * multiple policies become supported this will need to change.
 *
 * Built-in calculations - UniqueKeyValuesLists have a built-in median operation. This is
 * to avoid repeated calculations of the median by different calculations.
 */

/** Holds the indices of the fields whose values need to be saved and creates the
 *  UniqueKeyValuesLists objects that do the saving.
 */
final class SharedFieldValues
{
    // Arrays with field indices that need to be saved.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to add a shared field value for a specific field index.
     * eg. '--median 7' will end up calling addNumericIndex(6), 6 being the zero-based index.
     * A specific index is only added once.
     */
    final void addNumericIndex (size_t index)
    {
        if (!canFind(_numericFieldIndices, index)) _numericFieldIndices ~= index;
    }

    /* Similar to addNumericIndex, except adds a text index. */
    final void addTextIndex (size_t index)
    {
        if (!canFind(_textFieldIndices, index)) _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program if no keys
     * are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
    }
}

/** The saved field values for one unique key: one FieldValues list per saved numeric
 *  field and one per saved text field.
 */
final class UniqueKeyValuesLists
{
    /* A FieldValues object holds a list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;      // Not used by any method of this class.

    /* The UniqueKeyValuesLists constructor takes arrays of field indices to be saved.
     * One FieldValues object is created per index. Nothing is allocated for an empty array.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        if (numericFieldIndices.length > 0)
        {
            _numericFieldValues = new FieldValues!double[](numericFieldIndices.length);
            foreach (i, fieldIndex; numericFieldIndices)
                _numericFieldValues[i] = new FieldValues!double(fieldIndex);
        }

        if (textFieldIndices.length > 0)
        {
            _textFieldValues = new FieldValues!string[](textFieldIndices.length);
            foreach (i, fieldIndex; textFieldIndices)
                _textFieldValues[i] = new FieldValues!string(fieldIndex);
        }
    }

    /* Passes the input line to every value list so each can save its own field. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        _numericFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
        _textFieldValues.each!((ref x) => x.processNextLine(fields, missingPolicy));
    }

    /* Linear search for the numeric value list of a field index. The index must be one
     * of those passed to the constructor (checked by assert).
     */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        alias pred = (FieldValues!double a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_numericFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* Text counterpart of findNumericFieldValues. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        alias pred = (FieldValues!string a, size_t b) => (a.fieldIndex == b);
        auto r = find!pred(_textFieldValues, index);
        assert(!r.empty);
        return r.front;
    }

    /* The accessors below return the stored array itself, not a copy. The sorted
     * variants sort that array in place.
     */
    final double[] numericValues(size_t index)
    {
        return findNumericFieldValues(index).getArray;
    }

    final double[] numericValuesSorted(size_t index)
    {
        return findNumericFieldValues(index).getSortedArray;
    }

    final string[] textValues(size_t index)
    {
        return findTextFieldValues(index).getArray;
    }

    final string[] textValuesSorted(size_t index)
    {
        return findTextFieldValues(index).getSortedArray;
    }

    final double numericValuesMedian(size_t index)
    {
        return findNumericFieldValues(index).median;
    }

    /* The list of values saved for one field, with cached "sorted" and "median" state.
     * Both caches are dropped whenever a value is added.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : Appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Saves this list's field from the input line, converted to ValueType. A missing
         * (empty) field is saved unchanged, skipped, or saved as the replacement string,
         * as directed by the missing field policy.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
            {
                appendValue(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                appendValue(missingPolicy.missingReplacement.to!ValueType);
            }
        }

        /* Adds a value and invalidates the cached median and sort state. */
        private void appendValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Sorts the stored values in place (once) and returns them. */
        final ValueType[] getSortedArray()
        {
            if (!_isSorted)
            {
                import std.algorithm : sort;
                sort(_values.data);
                _isSorted = true;
            }
            return _values.data;
        }

        /* Returns the median, calculating it only if values were added since the last call.
         *
         * The median calculation operates on the stored values in place and, per the notes
         * preceding SharedFieldValues, may partially sort (reorder) them. A full sort done
         * earlier can therefore no longer be relied on, so the sorted flag is dropped and
         * getSortedArray() will sort again when next called.
         */
        final ValueType median()
        {
            if (!_haveMedian)
            {
                import tsv_utils.common.numerics : rangeMedian;
                _medianValue = _values.data.rangeMedian();
                _isSorted = false;
                _haveMedian = true;
            }

            return _medianValue;
        }
    }
}

/** SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
*/
class SingleFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;
    private size_t _fieldIndex;
    private bool _useHeaderSuffix;
    private bool _allowCustomHeader;
    private bool _hasCustomHeader = false;
    private size_t[] _numericFieldsToSave;
    private size_t[] _textFieldsToSave;
    private MissingFieldPolicy _missingPolicy;

    /* Stores the settings and installs the default header, which is built from the
     * field index. The default can later be replaced by a custom header or by the
     * header line.
     */
    this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy,
         Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix,
         Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader)
    {
        _name = operatorName;
        _fieldIndex = fieldIndex;
        _missingPolicy = missingPolicy;
        _useHeaderSuffix = useHeaderSuffix;
        _allowCustomHeader = allowCustomHeader;

        string suffix = _useHeaderSuffix ? operatorName : "";
        _header = summaryHeaderFromFieldHeader(fieldHeaderFromIndex(fieldIndex), suffix);
    }

    /* Installs a caller-supplied header. Only valid for operators that allow it. */
    void setCustomHeader (string customHeader)
    {
        assert(_allowCustomHeader);
        _hasCustomHeader = true;
        _header = customHeader;
    }

    @property final string name() const
    {
        return _name;
    }

    @property final bool allowCustomHeader() const
    {
        return _allowCustomHeader;
    }

    /* setSaveFieldValues[Numeric|Text] are called by derived classes to indicate that
     * the values of the operator's field should be saved. These should be called during
     * construction.
     */
    final void setSaveFieldValuesNumeric()
    {
        _numericFieldsToSave ~= _fieldIndex;
    }

    final void setSaveFieldValuesText()
    {
        _textFieldsToSave ~= _fieldIndex;
    }

    @property final MissingFieldPolicy missingPolicy()
    {
        return _missingPolicy;
    }

    @property final size_t fieldIndex() const
    {
        return _fieldIndex;
    }

    @property final string header() const
    {
        return _header;
    }

    @property final bool useHeaderSuffix() const
    {
        return _useHeaderSuffix;
    }

    /* Derives the summary header from the input header line. Does nothing when a
     * custom header has been set.
     */
    void processHeaderLine(const char[][] fields)
    {
        if (_hasCustomHeader) return;

        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        string fieldHeader = fields[_fieldIndex].to!string;
        string suffix = _useHeaderSuffix ? _name : "";
        _header = summaryHeaderFromFieldHeader(fieldHeader, suffix);
    }

    final size_t[] numericFieldsToSave()
    {
        return _numericFieldsToSave;
    }

    final size_t[] textFieldsToSave()
    {
        return _textFieldsToSave;
    }

    abstract SingleFieldCalculator makeCalculator();
}

/** SingleFieldCalculator is a base class for the common case of calculators using a single
 * field. Derived classes implement processNextField() rather than processNextLine().
*/
class SingleFieldCalculator : Calculator
{
    private size_t _fieldIndex;

    this(size_t fieldIndex)
    {
        _fieldIndex = fieldIndex;
    }

    final size_t fieldIndex() const @property
    {
        return _fieldIndex;
    }

    /* Selects this calculator's field from the input line and applies the operator's
     * missing field policy. The field is passed to processNextField() when missing
     * values are used as-is or the field is not missing. A missing field is otherwise
     * either replaced by the policy's replacement string or skipped.
     */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string);

        auto missingPolicy = getOperator.missingPolicy;
        const char[] field = fields[_fieldIndex];

        if (missingPolicy.useMissing || !missingPolicy.isMissingField(field))
        {
            processNextField(field);
        }
        else if (missingPolicy.replaceMissing)
        {
            processNextField(missingPolicy.missingReplacement);
        }
    }

    /* The operator this calculator belongs to. Used to reach the missing field policy. */
    abstract SingleFieldOperator getOperator();

    /* Receives the field value for each line that passes the missing field policy. */
    abstract void processNextField(const char[] field);
}

/* Unittest helper functions. Only compiled when -unittest is in effect. */
version(unittest)
{
    /** A helper for SingleFieldOperator unit tests.
     *
     * testSingleFieldOperator takes a set of split file values, a field index, a header
     * suffix, and a set of expected values. The expected values array contains the
     * initial value (zero entries) and the expected values after each line. (One more
     * expected value than input lines.) The zero entry case is what is generated for an
     * empty file. An example testing the 'min' operator against a file with 2 columns,
     * 3 rows, using field index 1:
     *
     *    testSingleFieldOperator!MinOperator(
     *       [["10", "100"],               // The split file. 3 rows by 2 columns.
     *        ["5", "50"],
     *        ["20", "200"]],
     *       1,                            // Field index (zero-based, so "100", "50", "200")
     *       "min",                        // The header suffix, normally the operator name.
     *       ["nan", "100", "50", "50"]);  // Min value after processing each line.
     *
     * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3.
     * Then the operator is tested against each column, a total of six calls. Headers
     * are automatically checked. Additional entries can be used to extend coverage.
     *
     * A non-default MissingFieldPolicy can be provided as an optional last argument.
     * Operator tests should include exclusion and replacement variations. See operator
     * unit tests for details.
     *
     * The testSingleFieldOperatorBase adds an additional capability - Custom operator
     * init arguments. Currently this is used only by the quantile operator.
     *
     * These tests do not check unique key behavior (group-by). Operators don't have info
     * about unique keys, and interact with them only indirectly, via Calculators.
     */
    void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
    {
        testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
    }

    /* Same as testSingleFieldOperator, with extra arguments (extraOpInitArgs) that are
     * forwarded to the operator's constructor after the field index and missing policy.
     */
    void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy,
         T extraOpInitArgs)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        auto numFields = (splitFile[0]).length;

        assert(fieldIndex < numFields,
               format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
                      headerSuffix));
        assert(splitFile.length + 1 == expectedValues.length,
               format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      headerSuffix));

        /* printOptions - Only the 'values-delimiter' (2nd arg) is used in these tests. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        /* The different expected output field headers. */
        auto outputFieldHeaderWithNoHeaderLine =
            fieldHeaderFromIndex(fieldIndex)
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto outputFieldHeaderFromHeaderLine =
            inputHeaderLine[fieldIndex]
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'",
                          op.name, hc, rowIndex, fieldIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            if (hasCustomHeader) assert(hasOutputHeader);

            auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));

            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               outputFieldHeaderFromHeaderLine));
                    break;
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               outputFieldHeaderWithNoHeaderLine));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
                                          summaryLineOutput.data.chomp, expectedValues[i]));
            }
        }
    }
}

/** ZeroFieldOperator is a base class for operators that take no input. The main use
 * case is the CountOperator, which counts the occurrences of each unique key. Other
 * uses are possible, for example, weighted random number assignment.
 *
 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify
 * the information available to such a routine.
In particular, the split fields passed
 * to processHeaderLine and processNextLine don't include all fields in the input,
 * something that might not be obvious when implementing an operator. (Only fields
 * required by operators acting on specific fields are included.)
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /* The operator name doubles as the default header. */
    this(string operatorName)
    {
        _header = operatorName;
        _name = operatorName;
    }

    /* Replaces the default header. */
    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    /* Custom headers are always accepted by this base class. */
    @property bool allowCustomHeader() const
    {
        return true;
    }

    @property final string name() const
    {
        return _name;
    }

    @property final string header() const
    {
        return _header;
    }

    /* Intentionally empty. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Nothing to save. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Nothing to save. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    abstract ZeroFieldCalculator makeCalculator();
}

/** ZeroFieldCalculator is a base class for operators that don't use fields as input.
 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() rather than processNextLine(), and the
 * single argument form of calculate() given as an abstract function.
*/
class ZeroFieldCalculator : Calculator
{
    this() { }

    /* The fields are ignored; every input line is forwarded as one "entry". */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__,);
        processNextEntry();
    }

    /* The values lists are ignored; the result comes from the single argument form. */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return calculate(printOptions);
    }

    /* Called once per input line. */
    abstract void processNextEntry();

    /* Produces the summary value as text. */
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}

version(unittest)
{
    /* A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array contains the expected values
     * after each line. Its first entry is the expected value before any line has been
     * processed, so it must hold exactly one more entry than splitFile has rows.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;   // Fixed: was 'std..string', which does not parse.
        import std.traits : EnumMembers;

        /* The number of columns is taken from the first row. */
        auto numFields = (splitFile[0]).length;

        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but need the object for the summarizer. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line: "header0", "header1", ... */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        auto customOutputFieldHeader = "custom";

        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'",
                          op.name, hc, rowIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
                );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
                );

            /* A custom header only makes sense when a header is written. */
            if (hasCustomHeader) assert(hasOutputHeader);

            /* A fresh operator and summarizer for every use case. */
            auto op = new OperatorClass();

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue;   // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));
            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == defaultHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               defaultHeader));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case (i == 0, nothing processed).
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i,
                                          summaryLineOutput.data.chomp, expectedValues[i]));
            }
        }
    }
}

/* Specific operators.
 *
 * Notes:
 * - CountCalculator is a 'static' nested class. This means its instances do not keep a
 *   reference to the context of the outer class.
 * - The calculators of the single field operators that follow are non-static inner
 *   classes. They return their operator from getOperator() via 'this.outer', and pass
 *   the index of the field they are summarizing to the SingleFieldCalculator constructor.
 */

/** CountOperator counts the number of occurrences of each unique key, or the number of
 * input lines if there is no unique key.
 *
 * CountOperator differs from most other operators in that it doesn't summarize a specific
 * field on the line. Instead it is summarizing a property of the unique key itself. For
 * this reason it doesn't derive from SingleFieldOperator.
*/
final class CountOperator : ZeroFieldOperator
{
    this()
    {
        super("count");
    }

    /* Every calculator carries its own running count. */
    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator();
    }

    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        /* One more entry has been seen. */
        final override void processNextEntry()
        {
            ++_count;
        }

        /* The summary value is the number of entries seen so far, formatted as text. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}

unittest // CountOperator
{
    /* The expected counts are the same for one, two, and three column inputs. */
    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    foreach (rows; [oneCol, twoCol, threeCol])
    {
        testZeroFieldOperator!CountOperator(rows, "count", ["0", "1", "2", "3"]);
    }
}

/** RetainOperator retains the first occurrence of a field, without changing the header.
 *
 * RetainOperator is intended for fields where the value is expected to be the same for
 * all occurrences of the unique key, and the goal is to pass the value through unchanged.
 * It is like FirstOperator, except that the original header is preserved. The original
 * header preservation is setup in the call to the SingleFieldOperator constructor.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
*/
final class RetainOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        /* No header suffix and no custom header. */
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        /* Only the first value offered is kept; later values are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /* The retained value, or the empty string if no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // RetainOperator
{
    auto oneCol = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoCol = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeCol = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!RetainOperator(oneCol, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoCol, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoCol, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeCol, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(threeCol, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeCol, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!RetainOperator(oneColMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RetainOperator(oneColMissing, 0, "", ["", "NA", "NA", "NA"],
                                           new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** FirstOperator outputs the first value found for the field.
 */
final class FirstOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        /* Only the first value offered is kept; later values are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /* The first value, or the empty string if no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // FirstOperator
{
    auto oneCol = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoCol = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeCol = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!FirstOperator(oneCol, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoCol, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoCol, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeCol, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(threeCol, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeCol, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!FirstOperator(oneColMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!FirstOperator(oneColMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** LastOperator outputs the last value found for the field.
 */
final class LastOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        /* Every value overwrites the previous one, so the most recent value wins. */
        final override void processNextField(const char[] nextField)
        {
            _value = to!string(nextField);
        }

        /* The most recent value, or the empty string if no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // LastOperator
{
    auto oneCol = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoCol = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeCol = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!LastOperator(oneCol, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoCol, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoCol, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeCol, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(threeCol, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeCol, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    auto oneColMissing = [[""], ["r2c1"], ["r3c1"]];
    testSingleFieldOperator!LastOperator(oneColMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!LastOperator(oneColMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         new MissingFieldPolicy(false, "NA"));  // Replace missing
}

/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 *
 * This operator returns the original string without additional numeric formatting.
 * This can be useful when joining back to the original data. This is different than
 * numeric operators that perform calculations.
 */
final class MinOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        /* The first value always becomes the minimum; after that only strictly
         * smaller values replace it. The original text is kept alongside the number.
         */
        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            if (_isFirst || fieldValue < _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /* The text of the minimum value, or "nan" if no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}

unittest // MinOperator
{
    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MinOperator(oneCol, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(twoCol, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoCol, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(threeCol, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeCol, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeCol, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    auto oneColMissing = [[""], ["10"], ["-10"]];
    testSingleFieldOperator!MinOperator(oneColMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MinOperator(oneColMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}

/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
 *
 * This operator returns the original string without additional numeric formatting.
 * This can be useful when joining back to the original data. This is different than
 * numeric operators that perform calculations.
*/
final class MaxOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        /* The first value always becomes the maximum; after that only strictly
         * larger values replace it. The original text is kept alongside the number.
         */
        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            if (_isFirst || fieldValue > _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /* The text of the maximum value, or "nan" if no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}

unittest // MaxOperator
{
    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MaxOperator(oneCol, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(twoCol, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoCol, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(threeCol, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeCol, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeCol, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    auto oneColMissing = [[""], ["-10"], ["10"]];
    testSingleFieldOperator!MaxOperator(oneColMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MaxOperator(oneColMissing, 0, "max", ["nan", "5", "5", "10"],
                                        new MissingFieldPolicy(false, "5"));  // Replace missing
}

/** RangeOperator outputs the difference between the minimum and maximum values.
 *
 * If there is a single value, or all values are the same, the range is zero. This is
 * a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            if (_isFirst)
            {
                /* The first value seeds both ends of the range. */
                _minValue = fieldValue;
                _maxValue = fieldValue;
                _isFirst = false;
                return;
            }

            /* A later value can extend the range at one end only. */
            if (fieldValue > _maxValue) _maxValue = fieldValue;
            else if (fieldValue < _minValue) _minValue = fieldValue;
        }

        /* Maximum minus minimum; zero when no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}

unittest // RangeOperator
{
    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!RangeOperator(oneCol, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(twoCol, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(twoCol, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(threeCol, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(threeCol, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(threeCol, 2, "range", ["0", "0", "4", "16.5"]);

    auto oneColMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!RangeOperator(oneColMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!RangeOperator(oneColMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          new MissingFieldPolicy(false, "5.5"));  // Replace missing
}

/** SumOperator produces the sum of all the values. This is a numeric operator.
*/
final class SumOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    final class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        /* Adds the numeric value of the field to the running total. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
        }

        /* The running total; zero when no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }
}

unittest // SumOperator
{
    auto oneCol = [["10"], ["9.5"], ["11"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(oneCol, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(twoCol, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoCol, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(threeCol, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeCol, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeCol, 2, "sum", ["0", "-4.5", "-5", "7"]);

    auto oneColMissing = [[""], ["10"], [""], ["9.5"], ["11"]];
    testSingleFieldOperator!SumOperator(oneColMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!SumOperator(oneColMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        new MissingFieldPolicy(false, "1.5"));  // Replace missing
}

/** MeanOperator produces the mean (average) of all the values. This is a numeric operator.
 */
final class MeanOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        /* Tracks the running total and the number of values that went into it. */
        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
            ++_count;
        }

        /* Total divided by count; nan when no value has been processed. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            double mean = double.nan;
            if (_count > 0) mean = _total / _count.to!double;
            return printOptions.formatNumber(mean);
        }
    }
}

unittest // MeanOperator
{
    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(oneCol, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(twoCol, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoCol, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(threeCol, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeCol, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeCol, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    auto oneColMissing = [[""], ["6"], [""], ["14"], ["40"]];
    testSingleFieldOperator!MeanOperator(oneColMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MeanOperator(oneColMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         new MissingFieldPolicy(false, "0"));  // Replace missing
}

/** MedianOperator produces the median of all the values. This is a numeric operator.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
*/
final class MedianOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* The median is read from the saved numeric values for this field. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex));
        }
    }
}

unittest // MedianOperator
{
    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(oneCol, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(twoCol, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoCol, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(threeCol, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeCol, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeCol, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    auto oneColMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperator!MedianOperator(oneColMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!MedianOperator(oneColMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           new MissingFieldPolicy(false, "0"));  // Replace missing
}

/** QuantileOperator produces the value representing the data at a cumulative probability.
 * This is a numeric operation.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). Data is sorted in ascending order. This operator takes one percentile, but it
 * is common to generate multiple quantile ranks for the same field when summarizing.
 *
 * All the field's values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        import std.format : format;

        assert(probability >= 0.0 && probability <= 1.0);

        /* The operator name is "pct" followed by the probability as a percentage. */
        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field. Work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        { }

        /* Applies quantile() to the sorted saved values, using the operator's probability. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            return printOptions.formatNumber(
                quantile(this.outer._prob, valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}

unittest // QuantileOperator
{
    auto oneCol = [["10"], ["9.5"], ["7.5"]];
    auto twoCol = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeCol = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultMissing = new MissingFieldPolicy;

    /* Same as the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(oneCol, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50);

    /* The extremes (0, 1), are min and max. */
    testSingleFieldOperatorBase!QuantileOperator(oneCol, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0);

    testSingleFieldOperatorBase!QuantileOperator(oneCol, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoCol, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeCol, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0);

    /* For missing policies, re-use the median tests. */
    auto oneColMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];
    testSingleFieldOperatorBase!QuantileOperator(oneColMissing, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                                 new MissingFieldPolicy(true, ""), 0.5);  // Exclude missing
    testSingleFieldOperatorBase!QuantileOperator(oneColMissing, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
                                                 new MissingFieldPolicy(false, "0"), 0.5);  // Replace missing
}

/** MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value, without a normalization applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Work is done by saving the field values.
*/ 3619 final override void processNextField(const char[] nextField) 3620 { } 3621 3622 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3623 { 3624 import std.math : abs; 3625 import tsv_utils.common.numerics : rangeMedian; 3626 3627 auto median = valuesLists.numericValuesMedian(fieldIndex); 3628 auto values = valuesLists.numericValues(fieldIndex); 3629 auto medianDevs = new double[values.length]; 3630 foreach (size_t i, double v; values) 3631 medianDevs[i] = abs(v - median); 3632 3633 return printOptions.formatNumber(medianDevs.rangeMedian); 3634 } 3635 } 3636 } 3637 3638 unittest // MadOperator 3639 { 3640 auto col1File = [["10"], ["15"], ["20"], ["25"], ["30"]]; 3641 auto col2File = [["2", "50"], ["2", "51"], ["2", "52"]]; 3642 auto col3File = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]]; 3643 3644 testSingleFieldOperator!MadOperator(col1File, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]); 3645 testSingleFieldOperator!MadOperator(col2File, 0, "mad", ["nan", "0", "0", "0"]); 3646 testSingleFieldOperator!MadOperator(col2File, 1, "mad", ["nan", "0", "0.5", "1"]); 3647 testSingleFieldOperator!MadOperator(col3File, 0, "mad", ["nan", "0", "4", "0"]); 3648 testSingleFieldOperator!MadOperator(col3File, 1, "mad", ["nan", "0", "0", "0"]); 3649 testSingleFieldOperator!MadOperator(col3File, 2, "mad", ["nan", "0", "1", "2"]); 3650 3651 auto col1misFile = [[""], ["16"], [""], ["32"], ["-4"]]; 3652 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "nan", "0", "0", "8", "16"], 3653 new MissingFieldPolicy(true, "")); // Exclude missing 3654 testSingleFieldOperator!MadOperator(col1misFile, 0, "mad", ["nan", "0", "8", "0", "8", "4"], 3655 new MissingFieldPolicy(false, "0")); // Replace missing 3656 } 3657 3658 /** Generates the variance of the fields values. This is a numeric operator. 
3659 */ 3660 final class VarianceOperator : SingleFieldOperator 3661 { 3662 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3663 { 3664 super("var", fieldIndex, missingPolicy); 3665 } 3666 3667 final override SingleFieldCalculator makeCalculator() 3668 { 3669 return new VarianceCalculator(fieldIndex); 3670 } 3671 3672 final class VarianceCalculator : SingleFieldCalculator 3673 { 3674 private double _count = 0.0; 3675 private double _mean = 0.0; 3676 private double _m2 = 0.0; // Sum of squares of differences from current mean 3677 3678 this(size_t fieldIndex) 3679 { 3680 super(fieldIndex); 3681 } 3682 3683 final override VarianceOperator getOperator() 3684 { 3685 return this.outer; 3686 } 3687 3688 final override void processNextField(const char[] nextField) 3689 { 3690 _count += 1.0; 3691 double fieldValue = nextField.to!double; 3692 double delta = fieldValue - _mean; 3693 _mean += delta / _count; 3694 _m2 += delta * (fieldValue - _mean); 3695 } 3696 3697 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3698 { 3699 return printOptions.formatNumber( 3700 (_count >= 2.0) ? 
(_m2 / (_count - 1.0)) : double.nan); 3701 } 3702 } 3703 } 3704 3705 unittest // VarianceOperator 3706 { 3707 auto col1File = [["5"], ["10"], ["15"]]; 3708 auto col2File = [["-5", "-5"], ["-10", "0"], ["-15", "5"]]; 3709 auto col3File = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]]; 3710 3711 testSingleFieldOperator!VarianceOperator(col1File, 0, "var", ["nan", "nan", "12.5", "25"]); 3712 testSingleFieldOperator!VarianceOperator(col2File, 0, "var", ["nan", "nan", "12.5", "25"]); 3713 testSingleFieldOperator!VarianceOperator(col2File, 1, "var", ["nan", "nan", "12.5", "25"]); 3714 testSingleFieldOperator!VarianceOperator(col3File, 0, "var", ["nan", "nan", "0.5", "1"]); 3715 testSingleFieldOperator!VarianceOperator(col3File, 1, "var", ["nan", "nan", "0.5", "1"]); 3716 testSingleFieldOperator!VarianceOperator(col3File, 2, "var", ["nan", "nan", "0", "3"]); 3717 3718 auto col1misFile = [["5"], ["10"], [""]]; 3719 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "12.5"], 3720 new MissingFieldPolicy(true, "")); // Exclude missing 3721 testSingleFieldOperator!VarianceOperator(col1misFile, 0, "var", ["nan", "nan", "12.5", "25"], 3722 new MissingFieldPolicy(false, "15")); // Replace missing 3723 } 3724 3725 /** Generates the standard deviation of the fields values. This is a numeric operator. 
3726 */ 3727 final class StDevOperator : SingleFieldOperator 3728 { 3729 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 3730 { 3731 super("stdev", fieldIndex, missingPolicy); 3732 } 3733 3734 final override SingleFieldCalculator makeCalculator() 3735 { 3736 return new StDevCalculator(fieldIndex); 3737 } 3738 3739 final class StDevCalculator : SingleFieldCalculator 3740 { 3741 private double _count = 0.0; 3742 private double _mean = 0.0; 3743 private double _m2 = 0.0; // Sum of squares of differences from current mean 3744 3745 this(size_t fieldIndex) 3746 { 3747 super(fieldIndex); 3748 } 3749 3750 final override StDevOperator getOperator() 3751 { 3752 return this.outer; 3753 } 3754 3755 final override void processNextField(const char[] nextField) 3756 { 3757 _count += 1.0; 3758 double fieldValue = nextField.to!double; 3759 double delta = fieldValue - _mean; 3760 _mean += delta / _count; 3761 _m2 += delta * (fieldValue - _mean); 3762 } 3763 3764 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 3765 { 3766 import std.math : sqrt; 3767 return printOptions.formatNumber( 3768 (_count >= 2.0) ? (_m2 / (_count - 1.0)).sqrt : double.nan); 3769 } 3770 } 3771 } 3772 3773 /* StDevOperator unit tests - These would be improved with a tolerance option. 
3774 */ 3775 unittest 3776 { 3777 auto col1File = [["1"], ["4"], ["7"]]; 3778 auto col2File = [["3", "3"], ["3", "9"], ["7", "15"]]; 3779 auto col3File = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]]; 3780 3781 testSingleFieldOperator!StDevOperator(col1File, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]); 3782 testSingleFieldOperator!StDevOperator(col2File, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]); 3783 testSingleFieldOperator!StDevOperator(col2File, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]); 3784 testSingleFieldOperator!StDevOperator(col3File, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]); 3785 testSingleFieldOperator!StDevOperator(col3File, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]); 3786 testSingleFieldOperator!StDevOperator(col3File, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]); 3787 3788 auto col1misFile = [["1"], ["4"], [""]]; 3789 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"], 3790 new MissingFieldPolicy(true, "")); // Exclude missing 3791 testSingleFieldOperator!StDevOperator(col1misFile, 0, "stdev", ["nan", "nan", "2.12132034356", "3"], 3792 new MissingFieldPolicy(false, "7")); // Replace missing 3793 } 3794 3795 /** UniqueCountOperator generates the number of unique values. Unique values are 3796 * based on exact text match calculation, not a numeric comparison. 3797 * 3798 * All the unique field values are stored in memory as part of this calculation. 
 */
final class UniqueCountOperator : SingleFieldOperator
{
    /** Creates the operator with the header "unique_count". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    /** Returns a new UniqueCountCalculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    /** Calculator that tracks the distinct field texts seen so far. */
    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Associative array used as a set; only the keys matter. */
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing UniqueCountOperator. */
        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /** Records the field text if it has not been seen before. The text is
         * converted to a string only when a new key is inserted.
         */
        final override void processNextField(const char[] nextField)
        {
            if (nextField !in _values) _values[nextField.to!string] = true;
        }

        /** Formats the number of distinct values with printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}

unittest // UniqueCount
{
    /* NOTE(review): the expected arrays look like the result with no rows, then
     * after each successive row - confirm against testSingleFieldOperator.
     */
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(col1File, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(col2File, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col2File, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(col3File, 2, "unique_count", ["0", "1", "2", "3"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
                                                new MissingFieldPolicy(true, "")); // Exclude missing


    testSingleFieldOperator!UniqueCountOperator(col1misFile, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
                                                new MissingFieldPolicy(false, "XYZ")); // Replace missing
}

/** MissingCountOperator generates the number of missing values. This overrides
 * the global missingFieldsPolicy.
 */
final class MissingCountOperator : SingleFieldOperator
{
    /* The caller's policy; used only to decide whether a field is missing. */
    private MissingFieldPolicy _globalMissingPolicy;

    /** Keeps the caller's policy for isMissingField() checks and gives the base class
     * a separate MissingFieldPolicy(false, "").
     *
     * NOTE(review): presumably (false, "") neither excludes nor replaces missing
     * fields, so they still reach processNextField - confirm against
     * MissingFieldPolicy.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /** Returns a new MissingCountCalculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    /** Calculator that counts the fields the caller's policy reports as missing. */
    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing MissingCountOperator. */
        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the count when the caller's policy reports the field as missing. */
        final override void processNextField(const char[] nextField)
        {
            if (this.outer._globalMissingPolicy.isMissingField(nextField)) _missingCount++;
        }

        /** Formats the missing-field count with printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}

unittest // MissingCount
{
    /* Per the expected counts, the single-space field in col1File is not counted as
     * missing. The same expectations are repeated for each missing policy.
     */
    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], excludeMissing);
    testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], excludeMissing);
    testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], excludeMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], excludeMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], excludeMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], excludeMissing);

    testSingleFieldOperator!MissingCountOperator(col1File, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], replaceMissing);
    testSingleFieldOperator!MissingCountOperator(col2File, 0, "missing_count", ["0", "0", "1", "1"], replaceMissing);
    testSingleFieldOperator!MissingCountOperator(col2File, 1, "missing_count", ["0", "1", "2", "3"], replaceMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 0, "missing_count", ["0", "1", "1", "1"], replaceMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 1, "missing_count", ["0", "0", "1", "2"], replaceMissing);
    testSingleFieldOperator!MissingCountOperator(col3File, 2, "missing_count", ["0", "0", "0", "1"], replaceMissing);
}

/** NotMissingCountOperator generates the number of not-missing values. This overrides
 * the global missingFieldsPolicy.
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    /* The caller's policy; used only to decide whether a field is missing. */
    private MissingFieldPolicy _globalMissingPolicy;

    /** Keeps the caller's policy for isMissingField() checks and gives the base class
     * a separate MissingFieldPolicy(false, "").
     *
     * NOTE(review): presumably (false, "") neither excludes nor replaces missing
     * fields, so every field reaches processNextField - confirm against
     * MissingFieldPolicy.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    /** Returns a new NotMissingCountCalculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    /** Calculator that counts the fields the caller's policy does not report as missing. */
    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing NotMissingCountOperator. */
        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /** Increments the count when the caller's policy does not report the field as missing. */
        final override void processNextField(const char[] nextField)
        {
            if (!this.outer._globalMissingPolicy.isMissingField(nextField)) _notMissingCount++;
        }

        /** Formats the not-missing count with printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}

unittest // NotMissingCount
{
    /* Same inputs as the MissingCount tests; the same expectations are repeated for
     * each missing policy.
     */
    auto col1File = [["a"], ["b"], [""], [" "], [""]];
    auto col2File = [["abc", ""], ["", ""], ["def", ""]];
    auto col3File = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], excludeMissing);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], excludeMissing);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], excludeMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], excludeMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], excludeMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], excludeMissing);

    testSingleFieldOperator!NotMissingCountOperator(col1File, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], replaceMissing);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 0, "not_missing_count", ["0", "1", "1", "2"], replaceMissing);
    testSingleFieldOperator!NotMissingCountOperator(col2File, 1, "not_missing_count", ["0", "0", "0", "0"], replaceMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 0, "not_missing_count", ["0", "0", "1", "2"], replaceMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 1, "not_missing_count", ["0", "1", "1", "1"], replaceMissing);
    testSingleFieldOperator!NotMissingCountOperator(col3File, 2, "not_missing_count", ["0", "1", "2", "2"], replaceMissing);
}

/** ModeOperator outputs the most frequent value seen. In the event of a tie, the
 * first value seen is produced.
 *
 * All the unique field values, with their counts, are stored in memory as part of
 * this calculation.
 *
 */
final class ModeOperator : SingleFieldOperator
{
    /** Creates the operator with the header "mode". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    /** Returns a new ModeCalculator for this operator's field. */
    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    /** Calculator that tallies occurrences of each distinct field text. */
    final class ModeCalculator : SingleFieldCalculator
    {
        /* Occurrence count per distinct value. */
        private size_t[string] _valueCounts;

        /* Distinct values in first-seen order; calculate() scans this to break ties. */
        private Appender!(string[]) _uniqueValues;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        /** Returns the enclosing ModeOperator. */
        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /** Registers a new value with a count of 1, or increments the count of a
         * value already seen.
         */
        final override void processNextField(const char[] nextField)
        {
            auto countPtr = (nextField in _valueCounts);

            if (countPtr is null)
            {
                string value = nextField.to!string;
                _uniqueValues.put(value);
                _valueCounts[value] = 1;
            }
            else
            {
                (*countPtr)++;
            }
        }

        /** Returns the most frequent value as text, or the empty string when no
         * values have been processed. printOptions is not used.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string modeValue = "";
            size_t modeCount = 0;

            foreach (value; _uniqueValues.data)
            {
                assert(value in _valueCounts);

                auto count = _valueCounts[value];

                /* Strict comparison: on equal counts the earlier value is kept. */
                if (count > modeCount)
                {
                    modeValue = value;
                    modeCount = count;
                }
            }

            return modeValue;
        }
    }
}

unittest // ModeOperator
{
    auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto col2File = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(col1File, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(col2File, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(col2File, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(col3File, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(col3File, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(col3File, 2, "mode", ["", "a", "a", "a"]);

    auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
                                         new MissingFieldPolicy(true, "")); // Exclude missing


    testSingleFieldOperator!ModeOperator(col1misFile, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
                                         new MissingFieldPolicy(false, "X")); // Replace missing
}

/** ModeCountOperator outputs the count of the most frequent value seen.
 *
 * All the unique field values, with their counts, are stored in memory as part of
 * this calculation.
4107 * 4108 */ 4109 final class ModeCountOperator : SingleFieldOperator 4110 { 4111 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 4112 { 4113 super("mode_count", fieldIndex, missingPolicy); 4114 } 4115 4116 final override SingleFieldCalculator makeCalculator() 4117 { 4118 return new ModeCountCalculator(fieldIndex); 4119 } 4120 4121 final class ModeCountCalculator : SingleFieldCalculator 4122 { 4123 private size_t[string] _valueCounts; 4124 4125 this(size_t fieldIndex) 4126 { 4127 super(fieldIndex); 4128 } 4129 4130 final override ModeCountOperator getOperator() 4131 { 4132 return this.outer; 4133 } 4134 4135 final override void processNextField(const char[] nextField) 4136 { 4137 auto countPtr = (nextField in _valueCounts); 4138 4139 if (countPtr is null) 4140 { 4141 string value = nextField.to!string; 4142 _valueCounts[value] = 1; 4143 } 4144 else 4145 { 4146 (*countPtr)++; 4147 } 4148 } 4149 4150 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 4151 { 4152 size_t modeCount = 0; 4153 foreach (count; _valueCounts.byValue) if (count > modeCount) modeCount = count; 4154 return printOptions.formatNumber(modeCount); 4155 } 4156 } 4157 } 4158 4159 unittest // ModeCountOperator 4160 { 4161 auto col1File = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]]; 4162 auto col2File = [["abc", ""], ["def", ""], ["def", "xyz"]]; 4163 auto col3File = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]]; 4164 4165 testSingleFieldOperator!ModeCountOperator(col1File, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]); 4166 testSingleFieldOperator!ModeCountOperator(col2File, 0, "mode_count", ["0", "1", "1", "2"]); 4167 testSingleFieldOperator!ModeCountOperator(col2File, 1, "mode_count", ["0", "1", "2", "2"]); 4168 testSingleFieldOperator!ModeCountOperator(col3File, 0, "mode_count", ["0", "1", "1", "1"]); 4169 testSingleFieldOperator!ModeCountOperator(col3File, 1, "mode_count", ["0", "1", "1", "2"]); 4170 
testSingleFieldOperator!ModeCountOperator(col3File, 2, "mode_count", ["0", "1", "1", "1"]); 4171 4172 auto col1misFile = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]]; 4173 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"], 4174 new MissingFieldPolicy(true, "")); // Exclude missing 4175 4176 4177 testSingleFieldOperator!ModeCountOperator(col1misFile, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"], 4178 new MissingFieldPolicy(false, "X")); // Replace missing 4179 } 4180 4181 /** ValuesOperator outputs each value delimited by an alternate delimiter character. 4182 * 4183 * All the field values are stored in memory as part of this calculation. This is 4184 * handled by unique key value lists. 4185 */ 4186 4187 final class ValuesOperator : SingleFieldOperator 4188 { 4189 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 4190 { 4191 super("values", fieldIndex, missingPolicy); 4192 setSaveFieldValuesText(); 4193 } 4194 4195 final override SingleFieldCalculator makeCalculator() 4196 { 4197 return new ValuesCalculator(fieldIndex); 4198 } 4199 4200 final class ValuesCalculator : SingleFieldCalculator 4201 { 4202 this(size_t fieldIndex) 4203 { 4204 super(fieldIndex); 4205 } 4206 4207 final override ValuesOperator getOperator() 4208 { 4209 return this.outer; 4210 } 4211 4212 /* Work is done by saving the field values. 
*/ 4213 final override void processNextField(const char[] nextField) 4214 { } 4215 4216 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 4217 { 4218 return valuesLists.textValues(fieldIndex).join(printOptions.valuesDelimiter); 4219 } 4220 } 4221 } 4222 4223 unittest // ValuesOperator 4224 { 4225 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 4226 auto col2File = [["", "50"], ["", "51"], ["xyz", "52"]]; 4227 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]]; 4228 4229 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]); 4230 testSingleFieldOperator!ValuesOperator(col2File, 0, "values", ["", "", "|", "||xyz"]); 4231 testSingleFieldOperator!ValuesOperator(col2File, 1, "values", ["", "50", "50|51", "50|51|52"]); 4232 testSingleFieldOperator!ValuesOperator(col3File, 0, "values", ["", "z", "z|y", "z|y|w"]); 4233 testSingleFieldOperator!ValuesOperator(col3File, 1, "values", ["", "a", "a|ab", "a|ab|ba"]); 4234 testSingleFieldOperator!ValuesOperator(col3File, 2, "values", ["", "-", "-|--", "-|--|---"]); 4235 4236 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"], 4237 new MissingFieldPolicy(true, "")); // Exclude missing 4238 4239 4240 testSingleFieldOperator!ValuesOperator(col1File, 0, "values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"], 4241 new MissingFieldPolicy(false, "X")); // Replace missing 4242 } 4243 4244 /** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter 4245 * character. Values are output in the order seen. 4246 * 4247 * All unique field values are stored in memory as part of this calculation. 
4248 * 4249 */ 4250 final class UniqueValuesOperator : SingleFieldOperator 4251 { 4252 this(size_t fieldIndex, MissingFieldPolicy missingPolicy) 4253 { 4254 super("unique_values", fieldIndex, missingPolicy); 4255 } 4256 4257 final override SingleFieldCalculator makeCalculator() 4258 { 4259 return new UniqueValuesCalculator(fieldIndex); 4260 } 4261 4262 final class UniqueValuesCalculator : SingleFieldCalculator 4263 { 4264 private size_t[string] _valuesHash; 4265 private Appender!(string[]) _uniqueValues; 4266 4267 this(size_t fieldIndex) 4268 { 4269 super(fieldIndex); 4270 } 4271 4272 final override UniqueValuesOperator getOperator() 4273 { 4274 return this.outer; 4275 } 4276 4277 final override void processNextField(const char[] nextField) 4278 { 4279 auto ptr = (nextField in _valuesHash); 4280 4281 if (ptr is null) 4282 { 4283 string value = nextField.to!string; 4284 _uniqueValues.put(value); 4285 _valuesHash[value] = 1; 4286 } 4287 } 4288 4289 final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions) 4290 { 4291 return _uniqueValues.data.join(printOptions.valuesDelimiter); 4292 } 4293 } 4294 } 4295 4296 unittest // UniqueValuesOperator 4297 { 4298 auto col1File = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]]; 4299 auto col2File = [["", "50"], ["", "50"], ["xyz", "52"]]; 4300 auto col3File = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]]; 4301 4302 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]); 4303 testSingleFieldOperator!UniqueValuesOperator(col2File, 0, "unique_values", ["", "", "", "|xyz"]); 4304 testSingleFieldOperator!UniqueValuesOperator(col2File, 1, "unique_values", ["", "50", "50", "50|52"]); 4305 testSingleFieldOperator!UniqueValuesOperator(col3File, 0, "unique_values", ["", "z", "z|y", "z|y|w"]); 4306 testSingleFieldOperator!UniqueValuesOperator(col3File, 1, "unique_values", ["", "a", "a|ab", 
"a|ab|ba"]); 4307 testSingleFieldOperator!UniqueValuesOperator(col3File, 2, "unique_values", ["", "-", "-|--", "-|--"]); 4308 4309 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"], 4310 new MissingFieldPolicy(true, "")); // Exclude missing 4311 4312 4313 testSingleFieldOperator!UniqueValuesOperator(col1File, 0, "unique_values", ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"], 4314 new MissingFieldPolicy(false, "X")); // Replace missing 4315 }