/* Runtime option compiled into the executable, only for compilers reporting
 * __VERSION__ >= 2085.
 * NOTE(review): "gcopt=cleanup:none" presumably skips the garbage collector's
 * cleanup pass at program exit - confirm against the druntime GC configuration docs.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // Unit test builds take their main from the -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line, then runs the summarization. Returns the exit code
     * supplied by argument processing when it asks to stop (help, version, or an
     * argument error), 1 if the summarization throws, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* Merge coverage reports when running under DMD code coverage. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSummarizeOptions cmdopt;
        auto argsResult = cmdopt.processArgs(cmdArgs);

        /* argsResult[0]: continue execution?  argsResult[1]: exit code if not. */
        if (!argsResult[0])
        {
            return argsResult[1];
        }

        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        int exitCode = 0;
        try
        {
            tsvSummarize(cmdopt);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            exitCode = 1;
        }
        return exitCode;
    }
}
/* Verbose help text, printed by '--help-verbose' ahead of the generated option list.
 *
 * The operator table must use the option names registered in
 * TsvSummarizeOptions.processArgs; the standard deviation operator is '--stdev'.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize reads tabular data files (tab-separated by default), tracks
field values for each unique key, and runs summarization algorithms. Consider
the file data.tsv:

  Make    Color   Time
  ford    blue    131
  chevy   green   124
  ford    red     128
  bmw     black   118
  bmw     black   126
  ford    blue    122

The min and average times for each make is generated by the command:

  $ tsv-summarize --header --group-by Make --min Time --mean Time data.tsv

This produces:

  Make   Time_min  Time_mean
  ford   122       127
  chevy  124       124
  bmw    118       122

Using '--group-by Make,Color' will group by both 'Make' and 'Color'.
Omitting the '--group-by' entirely summarizes fields for the full file.

The previous example uses field names to identify fields. Field numbers
can be used as well. The next two commands are equivalent:

  $ tsv-summarize -H --group-by Make,Color --min Time --mean Time data.tsv
  $ tsv-summarize -H --group-by 1,2 --min 3 --mean 3 data.tsv

The program tries to generate useful headers, but custom headers can be
specified. Example (using -H and -g shortcuts for --header and --group-by):

  $ tsv-summarize -H -g 1 --min 3:Fastest --mean 3:Average data.tsv

Most operators take custom headers in a similar way, generally following:

  --<operator-name> FIELD[:header]

Operators can be specified multiple times. They can also take multiple
fields (though not when a custom header is specified). Examples:

  --median 2,3,4
  --median 2-5,7-11
  --median elapsed_time,system_time,user_time
  --median '*_time'    # Wildcard. All fields ending in '_time'.

The quantile operator requires one or more probabilities after the fields:

  --quantile run_time:0.25       # Quantile 1 of the 'run_time' field
  --quantile 2:0.25              # Quantile 1 of field 2
  --quantile 2-4:0.25,0.5,0.75   # Q1, Median, Q3 of fields 2, 3, 4

Summarization operators available are:
  count       range        mad            values
  retain      sum          var            unique-values
  first       mean         stdev          unique-count
  last        median       mode           missing-count
  min         quantile     mode-count     not-missing-count
  max

Calculated numeric values are printed to 12 significant digits by default.
This can be changed using the '--p|float-precision' option. If six or less
it sets the number of significant digits after the decimal point. If
greater than six it sets the total number of significant digits.

Calculations hold onto the minimum data needed while reading data. A few
operations like median keep all data values in memory. These operations will
start to encounter performance issues as available memory becomes scarce. The
size that can be handled effectively is machine dependent, but often quite
large files can be handled.

Operations requiring numeric entries will signal an error and terminate
processing if a non-numeric entry is found.

Missing values are not treated specially by default, this can be changed
using the '--x|exclude-missing' or '--r|replace-missing' option. The former
turns off processing for missing values, the latter uses a replacement value.

Options:
EOS";

/* Short help text, printed by '--help' ahead of the generated option list. */
auto helpText = q"EOS
Synopsis: tsv-summarize [options] file [file...]

tsv-summarize runs aggregation operations on fields in tab-separated value
files. Operations can be run against the full input data or grouped by key
fields. Fields can be specified either by field number or field name. Use
'--help-verbose' for more detailed help.

Options:
EOS";
/** Command line options - Container and processing. The processArgs method is used to
 * process the command line.
 */
struct TsvSummarizeOptions {
    import tsv_utils.common.utils : byLineSourceRange, ByLineSourceRange;

    string programName;                /// Program name
    ByLineSourceRange!() inputSources; /// Input Files
    size_t[] keyFields;                /// -g, --group-by
    bool hasHeader = false;            /// --header
    bool writeHeader = false;          /// -w, --write-header
    char inputFieldDelimiter = '\t';   /// --d|delimiter
    char valuesDelimiter = '|';        /// --v|values-delimiter
    size_t floatPrecision = 12;        /// --p|float-precision
    DList!Operator operators;          /// Operators, in the order specified.
    size_t endFieldIndex = 0;          /// Derived value. Max field index used plus one.

    /* NOTE(review): a class instance used as a field default initializer is built once
     * and shared by every TsvSummarizeOptions value that keeps the default. processArgs
     * mutates it via updatePolicy - confirm the sharing is intended if more than one
     * options object is ever created.
     */
    MissingFieldPolicy globalMissingPolicy = new MissingFieldPolicy;   /// Derived value.

    /* tsv-summarize operators require access to the header line when the operator is
     * created. This is because named fields may be used to describe field names. To
     * enable this, a CmdOptionHandler delegate is added to the cmdLineOperatorOptions
     * array during initial processing by std.getopt. The group-by operation is
     * similar, but is added to the cmdLineOtherFieldOptions instead. At least one
     * cmdLineOperatorOptions entry is required.
     *
     * The different handlers are defined after processArgs.
     */

    /* CmdOptionHandler delegate signature - This is the call made to process the command
     * line option arguments after the header line has been read.
     */
    alias CmdOptionHandler = void delegate(bool hasHeader, string[] headerFields);

    private CmdOptionHandler[] cmdLineOperatorOptions;
    private CmdOptionHandler[] cmdLineOtherFieldOptions;

    /* Returns a tuple. First value is true if command line arguments were successfully
     * processed and execution should continue, or false if an error occurred or the user
     * asked for help. If false, the second value is the appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. In addition, field indices have been converted to zero-based.
     *
     * Any Exception raised during processing is caught here, reported on stderr, and
     * turned into the (false, 1) result.
     */
    auto processArgs (ref string[] cmdArgs) {
        import std.algorithm : any, each;
        import std.getopt;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist : fieldListHelpText;
        import tsv_utils.common.getopt_inorder;
        import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

        bool helpVerbose = false;          // --help-verbose
        bool helpFields = false;           // --help-fields
        bool versionWanted = false;        // --V|version
        bool excludeMissing = false;       // --x|exclude-missing
        string missingValueReplacement;    // --r|replace-missing


        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getoptInorder(
                cmdArgs,
                "help-verbose", " Print full help.", &helpVerbose,
                "help-fields", " Print help on specifying fields.", &helpFields,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "g|group-by", "<field-list> Fields to use as key.", &addGroupByOptionHandler,

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "w|write-header", " Write an output header even if there is no input header.", &writeHeader,
                "d|delimiter", "CHR Field delimiter. Default: TAB. (Single byte UTF-8 characters only.)", &inputFieldDelimiter,
                "v|values-delimiter", "CHR Values delimiter. Default: vertical bar (|). (Single byte UTF-8 characters only.)", &valuesDelimiter,
                "p|float-precision", "NUM 'Precision' to use printing floating point numbers. Affects the number of digits printed and exponent use. Default: 12", &floatPrecision,
                "x|exclude-missing", " Exclude missing (empty) fields from calculations.", &excludeMissing,
                "r|replace-missing", "STR Replace missing (empty) fields with STR in calculations.", &missingValueReplacement,
                "count", " Count occurrences of each unique key ('--g|group-by'), or the total number of records if no key field is specified.", &addCountOptionHandler,
                "count-header", "STR Count occurrences of each unique key, like '--count', but use STR as the header.", &addCountHeaderOptionHandler,
                "retain", "<field-list> Retain one copy of the field.", &addOperatorOptionHandler!RetainOperator,
                "first", "<field-list>[:STR] First value seen.", &addOperatorOptionHandler!FirstOperator,
                "last", "<field-list>[:STR] Last value seen.", &addOperatorOptionHandler!LastOperator,
                "min", "<field-list>[:STR] Min value. (Fields with numeric values only.)", &addOperatorOptionHandler!MinOperator,
                "max", "<field-list>[:STR] Max value. (Fields with numeric values only.)", &addOperatorOptionHandler!MaxOperator,
                "range", "<field-list>[:STR] Difference between min and max values. (Fields with numeric values only.)", &addOperatorOptionHandler!RangeOperator,
                "sum", "<field-list>[:STR] Sum of the values. (Fields with numeric values only.)", &addOperatorOptionHandler!SumOperator,
                "mean", "<field-list>[:STR] Mean (average). (Fields with numeric values only.)", &addOperatorOptionHandler!MeanOperator,
                "median", "<field-list>[:STR] Median value. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MedianOperator,
                "quantile", "<field-list>:p[,p...][:STR] Quantiles. One or more fields, then one or more 0.0-1.0 probabilities. (Fields with numeric values only. Reads all values into memory.)", &addQuantileOperatorOptionHandler,
                "mad", "<field-list>[:STR] Median absolute deviation from the median. Raw value, not scaled. (Fields with numeric values only. Reads all values into memory.)", &addOperatorOptionHandler!MadOperator,
                "var", "<field-list>[:STR] Variance. (Sample variance, numeric fields only).", &addOperatorOptionHandler!VarianceOperator,
                "stdev", "<field-list>[:STR] Standard deviation. (Sample st.dev, numeric fields only).", &addOperatorOptionHandler!StDevOperator,
                "mode", "<field-list>[:STR] Mode. The most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeOperator,
                "mode-count", "<field-list>[:STR] Count of the most frequent value. (Reads all unique values into memory.)", &addOperatorOptionHandler!ModeCountOperator,
                "unique-count", "<field-list>[:STR] Number of unique values. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueCountOperator,
                "missing-count", "<field-list>[:STR] Number of missing (empty) fields. Not affected by '--x|exclude-missing' or '--r|replace-missing'.", &addOperatorOptionHandler!MissingCountOperator,
                "not-missing-count", "<field-list>[:STR] Number of filled (non-empty) fields. Not affected by '--r|replace-missing'.", &addOperatorOptionHandler!NotMissingCountOperator,
                "values", "<field-list>[:STR] All the values, separated by --v|values-delimiter. (Reads all values into memory.)", &addOperatorOptionHandler!ValuesOperator,
                "unique-values", "<field-list>[:STR] All the unique values, separated by --v|values-delimiter. (Reads all unique values into memory.)", &addOperatorOptionHandler!UniqueValuesOperator,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (helpFields)
            {
                writeln(fieldListHelpText);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-summarize"));
                return tuple(false, 0);
            }

            /* Remaining command line args are files. Use standard input if files
             * were not provided. Truncate cmdArgs to consume the arguments.
             */
            string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;

            /* Validation and derivations - Do as much validation prior to header line
             * processing as possible (avoids waiting on stdin).
             */

            enforce(!cmdLineOperatorOptions.empty, "At least one summary operator is required.");

            /* The message names the options as registered above ('d|delimiter'). */
            enforce(inputFieldDelimiter != valuesDelimiter,
                    "Cannot use the same character for both --d|delimiter and --v|values-delimiter.");

            enforce(!(excludeMissing && missingValueReplacement.length != 0),
                    "Cannot use both '--x|exclude-missing' and '--r|replace-missing'.");

            /* Missing field policy. */
            globalMissingPolicy.updatePolicy(excludeMissing, missingValueReplacement);

            string[] headerFields;

            /* fieldListArgProcessing encapsulates the field list processing. It is
             * called prior to reading the header line if headers are not being used,
             * and after if headers are being used.
             */
            void fieldListArgProcessing()
            {
                /* Run all the operator handlers. */
                cmdLineOtherFieldOptions.each!(dg => dg(hasHeader, headerFields));
                cmdLineOperatorOptions.each!(dg => dg(hasHeader, headerFields));

                /* keyFields need to be part of the endFieldIndex, which is one past
                 * the last field index. */
                keyFields.each!(delegate (size_t x)
                                {
                                    if (x >= endFieldIndex) endFieldIndex = x + 1;
                                } );
            }

            if (!hasHeader) fieldListArgProcessing();

            /*
             * Create the byLineSourceRange and perform header line processing.
             */
            inputSources = byLineSourceRange(filepaths);


            if (hasHeader)
            {
                /* An empty first input leaves headerFields empty. */
                if (!inputSources.front.byLine.empty)
                {
                    throwIfWindowsNewlineOnUnix(inputSources.front.byLine.front, inputSources.front.name, 1);
                    headerFields = inputSources.front.byLine.front.split(inputFieldDelimiter).to!(string[]);
                }

                fieldListArgProcessing();
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }

    /* getopt callback for '--g|group-by'. Defers the real work until the header line
     * is available by queueing a delegate in cmdLineOtherFieldOptions.
     */
    private void addGroupByOptionHandler(string option, string optionVal)
    {
        cmdLineOtherFieldOptions ~=
            (bool hasHeader, string[] headerFields)
            => groupByOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Parses the group-by field list into zero-based keyFields. Parse errors are
     * rethrown with the option name and value prepended to the message.
     */
    private void groupByOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import tsv_utils.common.fieldlist;

        try
        {
            keyFields =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex)(hasHeader, headerFields)
                .array;
        }
        catch (Exception e)
        {
            e.msg = format("[--%s %s]. %s", option, optionVal, e.msg);
            throw e;
        }
    }

    /* getopt callback for the single-field operators. Queues a delegate in
     * cmdLineOperatorOptions that runs operatorOptionHandler for OperatorClass.
     */
    private void addOperatorOptionHandler(OperatorClass : SingleFieldOperator)(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => operatorOptionHandler!OperatorClass(hasHeader, headerFields, option, optionVal);
    }

    /* operatorOptionHandler functions are callbacks that process command line options
     * specifying summarization operations. eg. '--max 5', '--last 3:LastEntry'. Handlers
     * check syntactic correctness and instantiate Operator objects that do the work. This
     * is also where 1-upped field numbers are converted to 0-based indices.
     */
    private void operatorOptionHandler(OperatorClass : SingleFieldOperator)
        (bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.range : enumerate;
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;

            /* Anything left after the field list is the separator plus a custom header. */
            bool hasOptionalHeader = optionVal.length > optionValParse.consumed;
            string optionalHeader;

            if (hasOptionalHeader)
            {
                enforce(fieldIndices.length <= 1, "Cannot specify a custom header when using multiple fields.");
                enforce(optionVal.length - optionValParse.consumed > 1,
                        format("No value after field list.\n Expected: '--%s <field-list>' or '--%s <field>:<header>'.",
                               option, option));
                optionalHeader = optionVal[optionValParse.consumed + 1 .. $].idup;
            }

            foreach (fieldIndex; fieldIndices)
            {
                auto op = new OperatorClass(fieldIndex, globalMissingPolicy);

                if (hasOptionalHeader)
                {
                    enforce(op.allowCustomHeader, "Operator does not support custom headers.");
                    op.setCustomHeader(optionalHeader);
                }

                operators.insertBack(op);
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception exc)
        {
            import std.format : format;
            exc.msg = format("[--%s %s] %s", option, optionVal, exc.msg);
            throw exc;
        }
    }

    /* getopt callback for '--quantile'. Queues a delegate in cmdLineOperatorOptions. */
    private void addQuantileOperatorOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => quantileOperatorOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* QuantileOperator has a different syntax and needs a custom command option handler.
     * The option value is '<field-list>:<prob>[,<prob>...][:<header>]'. One operator is
     * created per field/probability pair.
     */
    private void quantileOperatorOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        import std.typecons : Yes, No;
        import tsv_utils.common.fieldlist;

        try
        {
            auto optionValParse =
                optionVal
                .parseFieldList!(size_t, Yes.convertToZeroBasedIndex, No.allowFieldNumZero, No.consumeEntireFieldListString)
                (hasHeader, headerFields);

            auto fieldIndices = optionValParse.array;
            enforce(optionVal.length - optionValParse.consumed > 1, "No probabilities entered.");

            /* Split '<probs>[:<header>]'. Element 1 is the ':' separator if present. */
            auto splitRemaining =
                optionVal[optionValParse.consumed + 1 .. $]
                .findSplit(":");

            enforce(splitRemaining[1].empty || !splitRemaining[2].empty,
                    "Empty custom header.");

            auto probStr = splitRemaining[0];
            auto header = splitRemaining[2];

            double[] probs;

            foreach (str; probStr.splitter(','))
            {
                double p = str.to!double;
                enforce(p >= 0.0 && p <= 1.0,
                        format("Probability '%g' is not in the interval [0.0,1.0].", p));
                probs ~= p;
            }

            enforce(header.empty || (fieldIndices.length <= 1 && probs.length <= 1),
                    format("Cannot specify a custom header when using multiple fields or multiple probabilities."));

            assert (fieldIndices.length > 0);
            assert (probs.length > 0);
            assert (header.empty || (fieldIndices.length == 1 && probs.length == 1));

            foreach (fieldIndex; fieldIndices)
            {
                foreach (p; probs)
                {
                    auto op = new QuantileOperator(fieldIndex, globalMissingPolicy, p);
                    if (!header.empty) op.setCustomHeader(header);
                    operators.insertBack(op);
                }
                if (fieldIndex >= endFieldIndex) endFieldIndex = fieldIndex + 1;
            }
        }
        catch (Exception e)
        {
            e.msg = format(
                "[--%s %s]. %s\n Expected: '--%s <field-list>:<prob>[,<prob>]' or '--%s <field>:<prob>:<header>' where <prob> is a number between 0.0 and 1.0.",
                option, optionVal, e.msg, option, option);
            throw e;
        }

    }

    /* getopt callback for '--count'. Queues a delegate in cmdLineOperatorOptions. */
    private void addCountOptionHandler()
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countOptionHandler(hasHeader, headerFields);
    }

    /* Appends a CountOperator. The header arguments are not used. */
    private void countOptionHandler(bool hasHeader, string[] headerFields)
    {
        operators.insertBack(new CountOperator());
    }

    /* getopt callback for '--count-header'. Queues a delegate in cmdLineOperatorOptions. */
    private void addCountHeaderOptionHandler(string option, string optionVal)
    {
        cmdLineOperatorOptions ~=
            (bool hasHeader, string[] headerFields)
            => countHeaderOptionHandler(hasHeader, headerFields, option, optionVal);
    }

    /* Appends a CountOperator that uses optionVal as its output header. */
    private void countHeaderOptionHandler(bool hasHeader, string[] headerFields, string option, string optionVal)
    {
        auto op = new CountOperator();
        op.setCustomHeader(optionVal);
        operators.insertBack(op);
    }
}
/** tsvSummarize does the primary work of the tsv-summarize program.
 *
 * It picks a summarizer based on the number of key fields, hands it the operators,
 * reads every input source line by line, and writes the header and summary lines to
 * stdout through a buffered output range.
 *
 * Throws: Exception if a line has fewer fields than cmdopt.endFieldIndex, or if the
 * summarizer fails to process a line.
 */
void tsvSummarize(ref TsvSummarizeOptions cmdopt)
{
    import tsv_utils.common.utils : BufferedOutputRange, ByLineSourceRange,
        bufferedByLine, throwIfWindowsNewlineOnUnix;

    /* The inputs should already be set up: at least one source (stdin if nothing
     * else), with line terminators removed by the byLine range.
     */
    assert(!cmdopt.inputSources.empty);
    static assert(is(typeof(cmdopt.inputSources) == ByLineSourceRange!(No.keepTerminator)));

    /* Buffered output is faster than writing directly to stdout when many lines are
     * written, which happens mostly when group-by is used.
     */
    auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);
    alias Output = typeof(bufferedOutput);

    /* Choose the summarizer from the number of key fields. */
    Summarizer!Output summarizer;
    if (cmdopt.keyFields.length == 0)
    {
        summarizer = new NoKeySummarizer!Output(
            cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
    }
    else if (cmdopt.keyFields.length == 1)
    {
        summarizer = new OneKeySummarizer!Output(
            cmdopt.keyFields[0], cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
    }
    else
    {
        summarizer = new MultiKeySummarizer!Output(
            cmdopt.keyFields, cmdopt.inputFieldDelimiter, cmdopt.globalMissingPolicy);
    }

    summarizer.setOperators(inputRangeObject(cmdopt.operators[]));

    auto printOptions = SummarizerPrintOptions(
        cmdopt.inputFieldDelimiter, cmdopt.valuesDelimiter, cmdopt.floatPrecision);

    /* With no input header but an output header requested, write it right away so
     * later stages of a unix pipeline can detect errors without waiting for the data.
     */
    if (cmdopt.writeHeader && !cmdopt.hasHeader)
    {
        summarizer.writeSummaryHeader(bufferedOutput, printOptions);
        bufferedOutput.flush;
    }

    /* Read each input source one line at a time. */
    auto fields = new char[][](cmdopt.endFieldIndex);
    bool sawHeader = false;

    foreach (source; cmdopt.inputSources)
    {
        foreach (lineNum, line; source.byLine.enumerate(1))
        {
            if (lineNum == 1) throwIfWindowsNewlineOnUnix(line, source.name, lineNum);

            /* Copy only as many fields as the operators and keys need. The number is
             * zero when no operator needs fields, e.g. the count operator by itself,
             * which then counts input lines (ala 'wc -l').
             */
            if (cmdopt.endFieldIndex > 0)
            {
                size_t numFilled = 0;
                foreach (value; line.splitter(cmdopt.inputFieldDelimiter).take(cmdopt.endFieldIndex))
                {
                    fields[numFilled] = value;
                    numFilled++;
                }

                if (numFilled == 0)
                {
                    assert(cmdopt.endFieldIndex > 0);
                    assert(line.length == 0);

                    /* Bug work-around. Empty lines are not handled properly by splitter.
                     *   - Bug: https://issues.dlang.org/show_bug.cgi?id=15735
                     *   - Pull Request: https://github.com/D-Programming-Language/phobos/pull/4030
                     * This can arise for: '$ tsv-summarize -g 1 --count'. With a single
                     * column, an empty line is an empty string for field 1, so point the
                     * first field at the (empty) line itself.
                     */
                    fields[numFilled] = line;
                    numFilled++;
                }

                enforce(numFilled >= cmdopt.endFieldIndex,
                        format("Not enough fields in line. File: %s, Line: %s",
                               source.name, lineNum));
            }

            if (cmdopt.hasHeader && lineNum == 1)
            {
                /* Header lines of the second and later files are skipped. */
                if (sawHeader) continue;

                summarizer.processHeaderLine(fields);
                sawHeader = true;

                /* Write the header immediately. An upstream task may have flushed its
                 * header line long before the main block of data arrives, and writing
                 * ours now lets downstream tasks detect errors quickly.
                 */
                summarizer.writeSummaryHeader(bufferedOutput, printOptions);
                bufferedOutput.flush;
                continue;
            }

            /* Data line. Processing throws if a field cannot be converted to the
             * expected type; add file and line context to the message.
             */
            try
            {
                summarizer.processNextLine(fields);
            }
            catch (Exception exc)
            {
                throw new Exception(
                    format("Could not process line or field: %s\n File: %s Line: %s%s",
                           exc.msg, source.name, lineNum,
                           (lineNum == 1) ? "\n Is this a header line? Use --header to skip." : ""));
            }
        }
    }

    debug writeln("[tsvSummarize] After reading all data.");

    /* All input has been read. Run the calculations and print. */
    summarizer.writeSummaryBody(bufferedOutput, printOptions);
}
/** Builds the default header for a field.
 *
 * Used when the input has no field headers but the output needs them. The result is
 * "fieldN", where N is the 1-upped field number (the zero-based fieldIndex plus one).
 */
string fieldHeaderFromIndex(size_t fieldIndex)
{
    immutable size_t fieldNumber = fieldIndex + 1;
    return format("field%d", fieldNumber);
}

unittest
{
    assert(fieldHeaderFromIndex(0) == "field1");
    assert(fieldHeaderFromIndex(10) == "field11");
}

/** Builds a summary header of the form `<fieldHeader>_<operation>`.
 *
 * For example, field header "length" and operation "max" give "length_max". The field
 * header typically comes from a header line in the input data or from
 * fieldHeaderFromIndex().
 *
 * An empty operationName returns fieldHeader unchanged. This supports the Retain
 * operator.
 */
string summaryHeaderFromFieldHeader(string fieldHeader, string operationName)
{
    if (operationName.length == 0)
    {
        return fieldHeader;
    }
    return fieldHeader ~ "_" ~ operationName;
}

unittest
{
    assert(summaryHeaderFromFieldHeader("originalfield", "mycalc") == "originalfield_mycalc");
    assert(summaryHeaderFromFieldHeader("originalfield", "") == "originalfield");
}
/** Printing options shared by Summarizers and Calculators.
 *
 * The values normally come from command line options; they are kept in their own
 * struct for modularity.
 */
struct SummarizerPrintOptions
{
    char fieldDelimiter;           /// Delimiter written between output fields.
    char valuesDelimiter;          /// Delimiter written between values within a field.
    size_t floatPrecision = 12;    /// Precision passed to the number formatter.

    import std.traits : isFloatingPoint, isIntegral;

    /** Formats an integral or floating point value by forwarding it, together with
     * floatPrecision, to tsv_utils.common.numerics.formatNumber.
     */
    auto formatNumber(T)(T n) const
    if (isFloatingPoint!T || isIntegral!T)
    {
        import tsv_utils.common.numerics : numericsFormatNumber = formatNumber;

        auto formatted = numericsFormatNumber!T(n, floatPrecision);
        return formatted;
    }
}
/** A Summarizer object maintains the state of the summarization and performs basic
 * processing. Handling of files and input lines is left to the caller.
 *
 * The interface is parameterized on OutputRange, the type of the output stream the
 * two write methods take by reference.
 *
 * Classes supporting the Summarizer must implement the methods:
 *  - setOperators - Called after initializing the object for each operator to be processed.
 *  - processHeaderLine - Called to process the header line of each file. Returns true if
 *    it was the first header line processed (used when reading multiple files).
 *  - processNextLine - Called to process non-header lines.
 *  - writeSummaryHeader - Called to write the header line.
 *  - writeSummaryBody - Called to write the result lines.
 */
interface Summarizer(OutputRange)
{
    /** Called after initializing the object for each operator to be processed.
     * The operators arrive as an input range of Operator objects.
     */
    void setOperators(InputRange!Operator op);

    /** Called to process the header line of each file. Returns true if it was the
     *  first header line processed (used when reading multiple files).
     */
    bool processHeaderLine(const char[][] lineFields);

    /** Called to process non-header lines. lineFields holds the line's field values. */
    void processNextLine(const char[][] lineFields);

    /** Called to write the header line to outputStream using the given print options. */
    void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);

    /** Called to write the result lines to outputStream using the given print options. */
    void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
/** SummarizerBase performs work shared by all summarizers, most everything except for
 * handling of unique keys.
 *
 * It stores the operators, counts them, and creates a SharedFieldValues object when
 * at least one operator asks for numeric or text field values to be saved. Derived
 * classes implement line processing and output.
 */
class SummarizerBase(OutputRange) : Summarizer!OutputRange
{
    private char _inputFieldDelimiter;
    private bool _hasProcessedFirstHeaderLine = false;
    private SharedFieldValues _sharedFieldValues = null;  // Stays null unless an operator saves field values.
    protected MissingFieldPolicy _missingPolicy;
    protected DList!Operator _operators;
    protected size_t _numOperators = 0;

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        _missingPolicy = missingPolicy;
        _inputFieldDelimiter = inputFieldDelimiter;
    }

    /** The input field delimiter given at construction. */
    char inputFieldDelimiter() const @property
    {
        return _inputFieldDelimiter;
    }

    /** Sets the Operators used by the Summarizer. Called after construction.
     *
     * Each operator is appended to the operator list. Field indices an operator wants
     * saved are registered with the shared field values object, which is created on
     * first need.
     */
    void setOperators(InputRange!Operator operators)
    {
        foreach (nextOp; operators)
        {
            _operators.insertBack(nextOp);
            ++_numOperators;

            auto numericIndices = nextOp.numericFieldsToSave();
            auto textIndices = nextOp.textFieldsToSave();

            /* Nothing to save for this operator. */
            if (numericIndices.length == 0 && textIndices.length == 0) continue;

            if (_sharedFieldValues is null) _sharedFieldValues = new SharedFieldValues();

            foreach (index; numericIndices) _sharedFieldValues.addNumericIndex(index);
            foreach (index; textIndices) _sharedFieldValues.addTextIndex(index);
        }
    }

    /** Called to process the header line of each file. Only the first call passes the
     * header to the operators and returns true; later calls return false.
     */
    bool processHeaderLine(const char[][] lineFields)
    {
        if (_hasProcessedFirstHeaderLine) return false;

        foreach (op; _operators) op.processHeaderLine(lineFields);
        _hasProcessedFirstHeaderLine = true;
        return true;
    }

    /** Creates a new UniqueKeyValuesLists from the shared field values, or returns
     * null when no operator asked for field values to be saved.
     */
    protected final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        if (_sharedFieldValues is null) return null;
        return _sharedFieldValues.makeUniqueKeyValuesLists;
    }

    abstract void processNextLine(const char[][] lineFields);
    abstract void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions);
    abstract void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions);
}
*/
final class NoKeySummarizer(OutputRange) : SummarizerBase!OutputRange
{
    private Calculator[] _calculators;          // One Calculator per Operator, in operator order.
    private UniqueKeyValuesLists _valueLists;   // Null if no Operator saves field values.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Called after initializing the object for each operator to be processed.
     *
     * The base class records the Operators; this override then creates the single
     * Calculator needed for each Operator and the value lists shared by them.
     */
    override void setOperators(InputRange!Operator operators)
    {
        super.setOperators(operators);

        /* Only one Calculator per Operator. The Calculators are built from the
         * operator list stored by the base class rather than by iterating
         * 'operators' a second time: an InputRange is not guaranteed to be
         * re-iterable, so a second pass over it could come up empty. Operators
         * that already have a Calculator (from an earlier call) are skipped.
         */
        foreach (op; _operators[].drop(_calculators.length)) _calculators ~= op.makeCalculator;
        _valueLists = super.makeUniqueKeyValuesLists();
    }

    /** Called to process non-header lines. */
    override void processNextLine(const char[][] lineFields)
    {
        _calculators.each!(x => x.processNextLine(lineFields));
        if (_valueLists !is null) _valueLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Called to write the header line: the operator headers joined by the output
     * field delimiter, followed by a newline.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Called to write the result lines. With no key there is exactly one result
     * line: each Calculator's result joined by the output field delimiter.
     */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream,
            _calculators[]
            .map!(x => x.calculate(_valueLists, printOptions))
            .join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }
}

/** KeySummarizerBase does work shared by the single key and multi-key summarizers.
 *
 * The primary difference between those two is the formation of the key. The primary
 * reason for separating those into two separate classes is to simplify (speed-up)
 * handling of single field keys, which are the most common use case.
*/
class KeySummarizerBase(OutputRange) : SummarizerBase!OutputRange
{
    /** State kept for each unique key: one Calculator per Operator, plus the saved
     * field values (null when no Operator saves field values).
     */
    protected struct UniqueKeyData
    {
        Calculator[] calculators;
        UniqueKeyValuesLists valuesLists;
    }

    private DList!string _uniqueKeys;               // Keys in first-seen order; drives output order.
    private UniqueKeyData[string] _uniqueKeyData;   // Per-key state, looked up by key.

    this(const char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
    }

    /** Processes one data line for the given key. The key's state is created the
     * first time the key is seen; the line is then passed to each of the key's
     * Calculators and, if present, to its value lists.
     */
    protected void processNextLineWithKey(T : const char[])(T key, const char[][] lineFields)
    {
        debug writefln("[%s]: %s", __FUNCTION__, lineFields.to!string);

        auto dataPtr = (key in _uniqueKeyData);
        auto data = (dataPtr is null) ? addUniqueKey(key.to!string) : *dataPtr;

        data.calculators.each!(x => x.processNextLine(lineFields));
        if (data.valuesLists !is null) data.valuesLists.processNextLine(lineFields, _missingPolicy);
    }

    /** Registers a key not seen before: records it in the ordered key list and
     * creates a Calculator for every Operator.
     *
     * Returns: the newly stored UniqueKeyData for the key.
     */
    protected UniqueKeyData addUniqueKey(string key)
    {
        assert(key !in _uniqueKeyData);

        _uniqueKeys.insertBack(key);

        auto calculators = new Calculator[_numOperators];
        size_t i = 0;
        foreach (op; _operators)
        {
            calculators[i] = op.makeCalculator;
            i++;
        }

        return _uniqueKeyData[key] = UniqueKeyData(calculators, super.makeUniqueKeyValuesLists());
    }

    /** Writes the header line: the key field header, then the operator headers,
     * separated by the output field delimiter.
     */
    override void writeSummaryHeader(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        put(outputStream, keyFieldHeader());
        put(outputStream, printOptions.fieldDelimiter);
        put(outputStream, _operators[].map!(op => op.header).join(printOptions.fieldDelimiter));
        put(outputStream, '\n');
    }

    /** Writes one result line per unique key, in the order the keys were first seen. */
    override void writeSummaryBody(ref OutputRange outputStream, const ref SummarizerPrintOptions printOptions)
    {
        foreach(key; _uniqueKeys)
        {
            auto data = _uniqueKeyData[key];
            put(outputStream, key);
            put(outputStream, printOptions.fieldDelimiter);
            put(outputStream,
                data.calculators[]
                .map!(x => x.calculate(data.valuesLists, printOptions))
                .join(printOptions.fieldDelimiter));
            put(outputStream, '\n');
        }
    }

    /** Header text written for the key column(s). */
    abstract string keyFieldHeader() const @property;
}

/** This Summarizer is for the case where the unique key is based on exactly one field.
 */
final class OneKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t _keyFieldIndex = 0;
    private string _keyFieldHeader;

    /* Note: the ordered list of unique keys lives in KeySummarizerBase. No list is
     * declared here; a same-named field in this class would only shadow the base
     * class list without ever being used.
     */

    this(size_t keyFieldIndex, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndex = keyFieldIndex;

        /* Default header; replaced if a header line is processed. */
        _keyFieldHeader = fieldHeaderFromIndex(keyFieldIndex);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line, the key column's header
     * is taken from the input.
     *
     * Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        /* Strictly less-than: lineFields[_keyFieldIndex] is read below. */
        assert(_keyFieldIndex < lineFields.length);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = lineFields[_keyFieldIndex].to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line, using the value of the key field as the key. */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndex < lineFields.length);
        processNextLineWithKey(lineFields[_keyFieldIndex], lineFields);
    }
}

/** This Summarizer is for the case where the unique key is based on multiple fields.
*/
final class MultiKeySummarizer(OutputRange) : KeySummarizerBase!OutputRange
{
    private size_t[] _keyFieldIndices;
    private string _keyFieldHeader;

    /* Note: the ordered list of unique keys lives in KeySummarizerBase. No list is
     * declared here; a same-named field in this class would only shadow the base
     * class list without ever being used.
     */

    this(const size_t[] keyFieldIndices, char inputFieldDelimiter, MissingFieldPolicy missingPolicy)
    {
        super(inputFieldDelimiter, missingPolicy);
        _keyFieldIndices = keyFieldIndices.dup;

        /* Default header, one entry per key field joined by the input delimiter;
         * replaced if a header line is processed.
         */
        _keyFieldHeader =
            _keyFieldIndices.map!(i => fieldHeaderFromIndex(i))
            .join(inputFieldDelimiter);
    }

    override string keyFieldHeader() const @property
    {
        return _keyFieldHeader;
    }

    /** Processes a header line. On the first header line, the key header is built
     * from the key fields' header values, joined by the input field delimiter.
     *
     * Returns: true if this was the first header line processed.
     */
    override bool processHeaderLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        bool isFirstHeaderLine = super.processHeaderLine(lineFields);
        if (isFirstHeaderLine)
        {
            _keyFieldHeader = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        }
        return isFirstHeaderLine;
    }

    /** Processes a data line. The key is the key fields' values, in key-field
     * order, joined by the input field delimiter.
     */
    override void processNextLine(const char[][] lineFields)
    {
        assert(_keyFieldIndices.all!(x => x < lineFields.length));
        assert(_keyFieldIndices.length >= 2);

        string key = _keyFieldIndices.map!(i => lineFields[i]).join(inputFieldDelimiter).to!string;
        processNextLineWithKey(key, lineFields);
    }
}

version(unittest)
{
    /* testSummarizer is a helper that can run many types of unit tests against
     * Summarizers. It can also test operators, but there are separate helper functions
     * better suited for that purpose.
     *
     * Arguments are command line args, an input file, and expected output. The
     * input file and expected output are already split into lines and fields, the helper
     * manages re-assembly.
The program name from the command line args is printed if
     * an error occurs; it is useful for identifying the test that failed.
     *
     * Note: Much of this duplicates tsvSummarize logic. Better abstraction of
     * file input/output would enable running unit tests directly on top of tsvSummarize.
     *
     * Update (April 2020): With the introduction of InputSourceRange and ByLineSource,
     * there needs to be a physical file when calling processArgs. It's hard to get around,
     * as the intent is to read the header line of the first input file during command
     * line argument processing. Eventually this unit test process will need to be
     * rewritten. For now, a file with the equivalent data is being added to the command
     * line.
     */
    void testSummarizer(string[] cmdArgs, string[][] file, string[][] expected)
    {
        import std.array : appender;

        assert(cmdArgs.length > 0, "[testSummarizer] cmdArgs must not be empty.");

        /* Builds an assert message prefixed with the helper name and the program
         * name (the first command line arg), which identifies the failing test.
         */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testSummarizer] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Keep a printable copy of the args before processArgs gets them. */
        TsvSummarizeOptions opts;
        auto savedCmdArgs = cmdArgs.to!string;
        auto argsResult = opts.processArgs(cmdArgs);
        assert(argsResult[0], formatAssertMessage("Invalid command line args: '%s'.", savedCmdArgs));

        /* Every input line must have all the fields the command line refers to. */
        assert(!file.any!(line => line.length < opts.endFieldIndex),
               formatAssertMessage("group-by or operator field number greater than number of fields a line of the input file."));

        /* Pick the Summarizer based on the number of key-fields entered. */
        alias OutputType = typeof(appender!(char[])());
        auto summarizer =
            (opts.keyFields.length == 0)
            ? new NoKeySummarizer!OutputType(
                opts.inputFieldDelimiter, opts.globalMissingPolicy)

            : (opts.keyFields.length == 1)
            ? new OneKeySummarizer!OutputType(
                opts.keyFields[0], opts.inputFieldDelimiter, opts.globalMissingPolicy)

            : new MultiKeySummarizer!OutputType(
                opts.keyFields, opts.inputFieldDelimiter, opts.globalMissingPolicy);

        /* Add the operators to the Summarizer. */
        summarizer.setOperators(inputRangeObject(opts.operators[]));

        /* Process the file one line at a time. */
        auto lineFields = new char[][](opts.endFieldIndex);
        bool headerFound = false;
        foreach (lineNum, line; file.enumerate(1))
        {
            /* Copy the needed fields to the fields array. */
            foreach (fieldIndex, fieldValue; line[0 .. opts.endFieldIndex])
            {
                lineFields[fieldIndex] = fieldValue.dup;
            }

            immutable isHeaderLine = opts.hasHeader && lineNum == 1;

            if (!isHeaderLine)
            {
                try summarizer.processNextLine(lineFields);
                catch (Exception exc)
                {
                    assert(false, formatAssertMessage(exc.msg));
                }
            }
            else if (!headerFound)
            {
                summarizer.processHeaderLine(lineFields);
                headerFound = true;
            }
        }

        /* Generate the summary output. */
        auto printOptions = SummarizerPrintOptions(
            opts.inputFieldDelimiter, opts.valuesDelimiter, opts.floatPrecision);

        auto summarizerOutput = appender!(char[])();

        if (opts.hasHeader || opts.writeHeader)
        {
            summarizer.writeSummaryHeader(summarizerOutput, printOptions);
        }
        summarizer.writeSummaryBody(summarizerOutput, printOptions);

        /* Re-assemble the expected output: fields joined by the input delimiter,
         * lines joined by newlines, with a trailing newline unless empty.
         */
        auto fieldSeparator = opts.inputFieldDelimiter.to!string;
        auto expectedOutput = expected.map!(fields => fields.joiner(fieldSeparator)).joiner("\n").to!string;
        if (expectedOutput.length > 0 && expectedOutput[$-1] != '\n') expectedOutput ~= "\n";

        assert(summarizerOutput.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, summarizerOutput.data.to!string));
    }

    /* writeDataFile writes the records to a file, one record per line, with the
     * fields of each record joined by the delimiter (TAB by default).
     */
    void writeDataFile(string filepath, string[][] fileData, string delimiter = "\t")
    {
        import std.algorithm;
        import std.stdio;

        auto outputFile = File(filepath, "w");
        foreach (fields; fileData)
        {
            outputFile.writeln(fields.joiner(delimiter));
        }
        outputFile.close();
    }
}

unittest
{
    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.file : mkdir, rmdirRecurse;
    import std.path : buildPath;

    auto testDir = makeUnittestTempDir("tsv_summarizer");
    scope(exit) testDir.rmdirRecurse;

    /* Summarizer unit tests. Primarily single-key and multi-key summarizers. To a limited
     * extent, command line option handling (TsvSummarizeOptions). Individual operators
     * have separate tests, those tests test the no-key summarizer. The Values operator is
     * used in these tests. It engages a number of behaviors, and the results have limited
     * ambiguity. Using only one operator limits dependence on individual operators.
     *
     * Update (April 2020): There now needs to be a real file passed to testSummarizer.
     * See the comments with testSummarizer for details.
     */

    auto file1 = [["fld1", "fld2", "fld3"],
                  ["a", "a", "3"],
                  ["c", "a", "2b"],
                  ["c", "bc", ""],
                  ["a", "c", "2b"],
                  ["", "bc", ""],
                  ["c", "bc", "3"]];

    auto file1Path = buildPath(testDir, "file1.tsv");
    auto file1NoHeaderPath = buildPath(testDir, "file1_noheader.tsv");
    writeDataFile(file1Path, file1);
    writeDataFile(file1NoHeaderPath, file1[1 .. $]);

    /* Single-key summarizer tests.
1193 */ 1194 testSummarizer(["unittest-sk-1", "--header", "--group-by", "1", "--values", "1", file1Path], 1195 file1, 1196 [["fld1", "fld1_values"], 1197 ["a", "a|a"], 1198 ["c", "c|c|c"], 1199 ["", ""]] 1200 ); 1201 testSummarizer(["unittest-sk-1-named", "--header", "--group-by", "fld1", "--values", "fld1", file1Path], 1202 file1, 1203 [["fld1", "fld1_values"], 1204 ["a", "a|a"], 1205 ["c", "c|c|c"], 1206 ["", ""]] 1207 ); 1208 testSummarizer(["unittest-sk-2", "-H", "--group-by", "1", "--values", "2", file1Path], 1209 file1, 1210 [["fld1", "fld2_values"], 1211 ["a", "a|c"], 1212 ["c", "a|bc|bc"], 1213 ["", "bc"]] 1214 ); 1215 testSummarizer(["unittest-sk-2-named", "-H", "--group-by", "fld1", "--values", "fld2", file1Path], 1216 file1, 1217 [["fld1", "fld2_values"], 1218 ["a", "a|c"], 1219 ["c", "a|bc|bc"], 1220 ["", "bc"]] 1221 ); 1222 testSummarizer(["unittest-sk-3", "-H", "-g", "1", "--values", "3", file1Path], 1223 file1, 1224 [["fld1", "fld3_values"], 1225 ["a", "3|2b"], 1226 ["c", "2b||3"], 1227 ["", ""]] 1228 ); 1229 testSummarizer(["unittest-sk-4", "-H", "--group-by", "1", "--values", "1,2,3", file1Path], 1230 file1, 1231 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1232 ["a", "a|a", "a|c", "3|2b"], 1233 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1234 ["", "", "bc", ""]] 1235 ); 1236 testSummarizer(["unittest-sk-4-named-a", "-H", "--group-by", "fld1", "--values", "fld1,fld2,fld3", file1Path], 1237 file1, 1238 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1239 ["a", "a|a", "a|c", "3|2b"], 1240 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1241 ["", "", "bc", ""]] 1242 ); 1243 testSummarizer(["unittest-sk-4-named-b", "-H", "--group-by", "fld1", "--values", "fld*", file1Path], 1244 file1, 1245 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1246 ["a", "a|a", "a|c", "3|2b"], 1247 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1248 ["", "", "bc", ""]] 1249 ); 1250 testSummarizer(["unittest-sk-5", "-H", "--group-by", "1", "--values", "1-3", file1Path], 1251 
file1, 1252 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1253 ["a", "a|a", "a|c", "3|2b"], 1254 ["c", "c|c|c", "a|bc|bc", "2b||3"], 1255 ["", "", "bc", ""]] 1256 ); 1257 testSummarizer(["unittest-sk-6", "-H", "--group-by", "1", "--values", "3,2,1", file1Path], 1258 file1, 1259 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1260 ["a", "3|2b", "a|c", "a|a"], 1261 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1262 ["", "", "bc", ""]] 1263 ); 1264 testSummarizer(["unittest-sk-7", "-H", "--group-by", "1", "--values", "3-1", file1Path], 1265 file1, 1266 [["fld1", "fld3_values", "fld2_values", "fld1_values"], 1267 ["a", "3|2b", "a|c", "a|a"], 1268 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1269 ["", "", "bc", ""]] 1270 ); 1271 testSummarizer(["unittest-sk-8", "-H", "--group-by", "2", "--values", "1", file1Path], 1272 file1, 1273 [["fld2", "fld1_values"], 1274 ["a", "a|c"], 1275 ["bc", "c||c"], 1276 ["c", "a"]] 1277 ); 1278 testSummarizer(["unittest-sk-9", "-H", "--group-by", "2", "--values", "2", file1Path], 1279 file1, 1280 [["fld2", "fld2_values"], 1281 ["a", "a|a"], 1282 ["bc", "bc|bc|bc"], 1283 ["c", "c"]] 1284 ); 1285 testSummarizer(["unittest-sk-10", "-H", "--group-by", "2", "--values", "3", file1Path], 1286 file1, 1287 [["fld2", "fld3_values"], 1288 ["a", "3|2b"], 1289 ["bc", "||3"], 1290 ["c", "2b"]] 1291 ); 1292 testSummarizer(["unittest-sk-11", "-H", "--group-by", "2", "--values", "1,3", file1Path], 1293 file1, 1294 [["fld2", "fld1_values", "fld3_values"], 1295 ["a", "a|c", "3|2b"], 1296 ["bc", "c||c", "||3"], 1297 ["c", "a", "2b"]] 1298 ); 1299 testSummarizer(["unittest-sk-12", "-H", "--group-by", "2", "--values", "3,1", file1Path], 1300 file1, 1301 [["fld2", "fld3_values", "fld1_values"], 1302 ["a", "3|2b", "a|c"], 1303 ["bc", "||3", "c||c"], 1304 ["c", "2b", "a"]] 1305 ); 1306 testSummarizer(["unittest-sk-13", "-H", "--group-by", "3", "--values", "1", file1Path], 1307 file1, 1308 [["fld3", "fld1_values"], 1309 ["3", "a|c"], 1310 ["2b", "c|a"], 1311 ["", 
"c|"]] 1312 ); 1313 testSummarizer(["unittest-sk-14", "-H", "--group-by", "3", "--values", "2", file1Path], 1314 file1, 1315 [["fld3", "fld2_values"], 1316 ["3", "a|bc"], 1317 ["2b", "a|c"], 1318 ["", "bc|bc"]] 1319 ); 1320 testSummarizer(["unittest-sk-15", "-H", "--group-by", "3", "--values", "1,2", file1Path], 1321 file1, 1322 [["fld3", "fld1_values", "fld2_values"], 1323 ["3", "a|c", "a|bc"], 1324 ["2b", "c|a", "a|c"], 1325 ["", "c|", "bc|bc"]] 1326 ); 1327 testSummarizer(["unittest-sk-15-named", "-H", "--group-by", "fld3", "--values", "fld1,fld2", file1Path], 1328 file1, 1329 [["fld3", "fld1_values", "fld2_values"], 1330 ["3", "a|c", "a|bc"], 1331 ["2b", "c|a", "a|c"], 1332 ["", "c|", "bc|bc"]] 1333 ); 1334 1335 /* Multi-key summarizer tests. 1336 */ 1337 testSummarizer(["unittest-mk-1", "--header", "--group-by", "1,2", "--values", "1", file1Path], 1338 file1, 1339 [["fld1", "fld2", "fld1_values"], 1340 ["a", "a", "a"], 1341 ["c", "a", "c"], 1342 ["c", "bc", "c|c"], 1343 ["a", "c", "a"], 1344 ["", "bc", ""]] 1345 ); 1346 testSummarizer(["unittest-mk-2", "-H", "--group-by", "1,2", "--values", "2", file1Path], 1347 file1, 1348 [["fld1", "fld2", "fld2_values"], 1349 ["a", "a", "a"], 1350 ["c", "a", "a"], 1351 ["c", "bc", "bc|bc"], 1352 ["a", "c", "c"], 1353 ["", "bc", "bc"]] 1354 ); 1355 testSummarizer(["unittest-mk-3", "-H", "--group-by", "1,2", "--values", "3", file1Path], 1356 file1, 1357 [["fld1", "fld2", "fld3_values"], 1358 ["a", "a", "3"], 1359 ["c", "a", "2b"], 1360 ["c", "bc", "|3"], 1361 ["a", "c", "2b"], 1362 ["", "bc", ""]] 1363 ); 1364 testSummarizer(["unittest-mk-4", "-H", "--group-by", "1,2", "--values", "3,1", file1Path], 1365 file1, 1366 [["fld1", "fld2", "fld3_values", "fld1_values"], 1367 ["a", "a", "3", "a"], 1368 ["c", "a", "2b", "c"], 1369 ["c", "bc", "|3", "c|c"], 1370 ["a", "c", "2b", "a"], 1371 ["", "bc", "", ""]] 1372 ); 1373 testSummarizer(["unittest-mk-4-named", "-H", "--group-by", "fld1,fld2", "--values", "fld3,fld1", file1Path], 1374 
file1, 1375 [["fld1", "fld2", "fld3_values", "fld1_values"], 1376 ["a", "a", "3", "a"], 1377 ["c", "a", "2b", "c"], 1378 ["c", "bc", "|3", "c|c"], 1379 ["a", "c", "2b", "a"], 1380 ["", "bc", "", ""]] 1381 ); 1382 testSummarizer(["unittest-mk-5", "-H", "--group-by", "3,2", "--values", "1", file1Path], 1383 file1, 1384 [["fld3", "fld2", "fld1_values"], 1385 ["3", "a", "a"], 1386 ["2b", "a", "c"], 1387 ["", "bc", "c|"], 1388 ["2b", "c", "a"], 1389 ["3", "bc", "c"]] 1390 ); 1391 testSummarizer(["unittest-mk-6", "-H", "--group-by", "3-2", "--values", "1", file1Path], 1392 file1, 1393 [["fld3", "fld2", "fld1_values"], 1394 ["3", "a", "a"], 1395 ["2b", "a", "c"], 1396 ["", "bc", "c|"], 1397 ["2b", "c", "a"], 1398 ["3", "bc", "c"]] 1399 ); 1400 testSummarizer(["unittest-mk-7", "-H", "--group-by", "2,1,3", "--values", "2", file1Path], 1401 file1, 1402 [["fld2", "fld1", "fld3", "fld2_values"], 1403 ["a", "a", "3", "a"], 1404 ["a", "c", "2b", "a"], 1405 ["bc", "c", "", "bc"], 1406 ["c", "a", "2b", "c"], 1407 ["bc", "", "", "bc"], 1408 ["bc", "c", "3", "bc"]] 1409 ); 1410 1411 /* Missing policies. 
*/ 1412 testSummarizer(["unittest-mis-1", "--header", "--group-by", "1", "--values", "1", "--exclude-missing", file1Path], 1413 file1, 1414 [["fld1", "fld1_values"], 1415 ["a", "a|a"], 1416 ["c", "c|c|c"], 1417 ["", ""]] 1418 ); 1419 testSummarizer(["unittest-mis-2", "-H", "--group-by", "1", "--values", "2", "-x", file1Path], 1420 file1, 1421 [["fld1", "fld2_values"], 1422 ["a", "a|c"], 1423 ["c", "a|bc|bc"], 1424 ["", "bc"]] 1425 ); 1426 testSummarizer(["unittest-mis-3", "-H", "-g", "1", "--values", "3", "-x", file1Path], 1427 file1, 1428 [["fld1", "fld3_values"], 1429 ["a", "3|2b"], 1430 ["c", "2b|3"], 1431 ["", ""]] 1432 ); 1433 testSummarizer(["unittest-mis-4", "-H", "--group-by", "1", "--values", "1,2,3", "-x", file1Path], 1434 file1, 1435 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1436 ["a", "a|a", "a|c", "3|2b"], 1437 ["c", "c|c|c", "a|bc|bc", "2b|3"], 1438 ["", "", "bc", ""]] 1439 ); 1440 testSummarizer(["unittest-mis-5", "--header", "--group-by", "1", "--values", "1", "--replace-missing", "NA", file1Path], 1441 file1, 1442 [["fld1", "fld1_values"], 1443 ["a", "a|a"], 1444 ["c", "c|c|c"], 1445 ["", "NA"]] 1446 ); 1447 testSummarizer(["unittest-mis-6", "-H", "--group-by", "1", "--values", "2", "-r", "NA", file1Path], 1448 file1, 1449 [["fld1", "fld2_values"], 1450 ["a", "a|c"], 1451 ["c", "a|bc|bc"], 1452 ["", "bc"]] 1453 ); 1454 testSummarizer(["unittest-mis-7", "-H", "-g", "1", "--values", "3", "-r", "NA", file1Path], 1455 file1, 1456 [["fld1", "fld3_values"], 1457 ["a", "3|2b"], 1458 ["c", "2b|NA|3"], 1459 ["", "NA"]] 1460 ); 1461 testSummarizer(["unittest-mis-7-named", "-H", "-g", "fld1", "--values", "fld3", "-r", "NA", file1Path], 1462 file1, 1463 [["fld1", "fld3_values"], 1464 ["a", "3|2b"], 1465 ["c", "2b|NA|3"], 1466 ["", "NA"]] 1467 ); 1468 testSummarizer(["unittest-mis-8", "-H", "--group-by", "1", "--values", "1,2,3", "-r", "NA", file1Path], 1469 file1, 1470 [["fld1", "fld1_values", "fld2_values", "fld3_values"], 1471 ["a", "a|a", 
"a|c", "3|2b"], 1472 ["c", "c|c|c", "a|bc|bc", "2b|NA|3"], 1473 ["", "NA", "bc", "NA"]] 1474 ); 1475 testSummarizer(["unittest-mis-9", "-H", "--group-by", "1,2", "--values", "3,1", "-x", file1Path], 1476 file1, 1477 [["fld1", "fld2", "fld3_values", "fld1_values"], 1478 ["a", "a", "3", "a"], 1479 ["c", "a", "2b", "c"], 1480 ["c", "bc", "3", "c|c"], 1481 ["a", "c", "2b", "a"], 1482 ["", "bc", "", ""]] 1483 ); 1484 testSummarizer(["unittest-mis-10", "-H", "--group-by", "3,2", "--values", "1", "-x", file1Path], 1485 file1, 1486 [["fld3", "fld2", "fld1_values"], 1487 ["3", "a", "a"], 1488 ["2b", "a", "c"], 1489 ["", "bc", "c"], 1490 ["2b", "c", "a"], 1491 ["3", "bc", "c"]] 1492 ); 1493 testSummarizer(["unittest-mis-11", "-H", "--group-by", "2,1,3", "--values", "2", "-x", file1Path], 1494 file1, 1495 [["fld2", "fld1", "fld3", "fld2_values"], 1496 ["a", "a", "3", "a"], 1497 ["a", "c", "2b", "a"], 1498 ["bc", "c", "", "bc"], 1499 ["c", "a", "2b", "c"], 1500 ["bc", "", "", "bc"], 1501 ["bc", "c", "3", "bc"]] 1502 ); 1503 testSummarizer(["unittest-mis-12", "-H", "--group-by", "1,2", "--values", "3,1", "-r", "NA", file1Path], 1504 file1, 1505 [["fld1", "fld2", "fld3_values", "fld1_values"], 1506 ["a", "a", "3", "a"], 1507 ["c", "a", "2b", "c"], 1508 ["c", "bc", "NA|3", "c|c"], 1509 ["a", "c", "2b", "a"], 1510 ["", "bc", "NA", "NA"]] 1511 ); 1512 testSummarizer(["unittest-mis-13", "-H", "--group-by", "3,2", "--values", "1", "-r", "NA", file1Path], 1513 file1, 1514 [["fld3", "fld2", "fld1_values"], 1515 ["3", "a", "a"], 1516 ["2b", "a", "c"], 1517 ["", "bc", "c|NA"], 1518 ["2b", "c", "a"], 1519 ["3", "bc", "c"]] 1520 ); 1521 testSummarizer(["unittest-mis-14", "-H", "--group-by", "2,1,3", "--values", "2", "-r", "NA", file1Path], 1522 file1, 1523 [["fld2", "fld1", "fld3", "fld2_values"], 1524 ["a", "a", "3", "a"], 1525 ["a", "c", "2b", "a"], 1526 ["bc", "c", "", "bc"], 1527 ["c", "a", "2b", "c"], 1528 ["bc", "", "", "bc"], 1529 ["bc", "c", "3", "bc"]] 1530 ); 1531 1532 /* 
Validate that the no-key summarizer works with testSummarizer helper function. 1533 */ 1534 testSummarizer(["unittest-nk-1", "-H", "--values", "1,2", file1Path], 1535 file1, 1536 [["fld1_values", "fld2_values"], 1537 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1538 ); 1539 testSummarizer(["unittest-nk-1-named", "-H", "--values", "fld1,fld2", file1Path], 1540 file1, 1541 [["fld1_values", "fld2_values"], 1542 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1543 ); 1544 1545 /* Header variations: no header line; auto-generated header line; custom headers. 1546 */ 1547 testSummarizer(["unittest-hdr-1", "--group-by", "1", "--values", "1", file1NoHeaderPath], 1548 file1[1..$], 1549 [["a", "a|a"], 1550 ["c", "c|c|c"], 1551 ["", ""]] 1552 ); 1553 testSummarizer(["unittest-hdr-2", "--group-by", "1,2", "--values", "2", file1NoHeaderPath], 1554 file1[1..$], 1555 [["a", "a", "a"], 1556 ["c", "a", "a"], 1557 ["c", "bc", "bc|bc"], 1558 ["a", "c", "c"], 1559 ["", "bc", "bc"]] 1560 ); 1561 testSummarizer(["unittest-hdr-3", "--write-header", "--group-by", "2", "--values", "1", file1NoHeaderPath], 1562 file1[1..$], 1563 [["field2", "field1_values"], 1564 ["a", "a|c"], 1565 ["bc", "c||c"], 1566 ["c", "a"]] 1567 ); 1568 testSummarizer(["unittest-hdr-4", "-w", "--group-by", "3,2", "--values", "1", file1NoHeaderPath], 1569 file1[1..$], 1570 [["field3", "field2", "field1_values"], 1571 ["3", "a", "a"], 1572 ["2b", "a", "c"], 1573 ["", "bc", "c|"], 1574 ["2b", "c", "a"], 1575 ["3", "bc", "c"]] 1576 ); 1577 testSummarizer(["unittest-hdr-5", "-H", "--group-by", "2", "--values", "3:Field3Values", file1Path], 1578 file1, 1579 [["fld2", "Field3Values"], 1580 ["a", "3|2b"], 1581 ["bc", "||3"], 1582 ["c", "2b"]] 1583 ); 1584 testSummarizer(["unittest-hdr-6", "-H", "--group-by", "1,2", "--values", "3:FieldThreeValues", "--values", "1:FieldOneValues", file1Path], 1585 file1, 1586 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1587 ["a", "a", "3", "a"], 1588 ["c", "a", "2b", "c"], 1589 ["c", "bc", "|3", "c|c"], 
1590 ["a", "c", "2b", "a"], 1591 ["", "bc", "", ""]] 1592 ); 1593 testSummarizer(["unittest-hdr-6-named-a", "-H", "--group-by", "fld1,fld2", "--values", "fld3:FieldThreeValues", "--values", "fld1:FieldOneValues", file1Path], 1594 file1, 1595 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1596 ["a", "a", "3", "a"], 1597 ["c", "a", "2b", "c"], 1598 ["c", "bc", "|3", "c|c"], 1599 ["a", "c", "2b", "a"], 1600 ["", "bc", "", ""]] 1601 ); 1602 testSummarizer(["unittest-hdr-6-named-b", "-H", "--group-by", "fld1,fld2", "--values", "fld3 FieldThreeValues", "--values", "fld1 FieldOneValues", file1Path], 1603 file1, 1604 [["fld1", "fld2", "FieldThreeValues", "FieldOneValues"], 1605 ["a", "a", "3", "a"], 1606 ["c", "a", "2b", "c"], 1607 ["c", "bc", "|3", "c|c"], 1608 ["a", "c", "2b", "a"], 1609 ["", "bc", "", ""]] 1610 ); 1611 testSummarizer(["unittest-hdr-7", "--write-header", "--group-by", "1", "--values", "3:f3_vals","--values", "2:f2_vals", "--values", "1:f1_vals", file1NoHeaderPath], 1612 file1[1..$], 1613 [["field1", "f3_vals", "f2_vals", "f1_vals"], 1614 ["a", "3|2b", "a|c", "a|a"], 1615 ["c", "2b||3", "a|bc|bc", "c|c|c"], 1616 ["", "", "bc", ""]] 1617 ); 1618 testSummarizer(["unittest-hdr-8", "--write-header", "--group-by", "1,3,2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1619 file1[1..$], 1620 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1621 ["a", "3", "a", "3", "a", "a"], 1622 ["c", "2b", "a", "2b", "c", "a"], 1623 ["c", "", "bc", "", "c", "bc"], 1624 ["a", "2b", "c", "2b", "a", "c"], 1625 ["", "", "bc", "", "", "bc"], 1626 ["c", "3", "bc", "3", "c", "bc"]] 1627 ); 1628 testSummarizer(["unittest-hdr-9", "--write-header", "--group-by", "1,3-2", "--values", "3", "--values", "1:ValsField1", "--values", "2:ValsField2", file1NoHeaderPath], 1629 file1[1..$], 1630 [["field1", "field3", "field2", "field3_values", "ValsField1", "ValsField2"], 1631 ["a", "3", "a", "3", "a", "a"], 
1632 ["c", "2b", "a", "2b", "c", "a"], 1633 ["c", "", "bc", "", "c", "bc"], 1634 ["a", "2b", "c", "2b", "a", "c"], 1635 ["", "", "bc", "", "", "bc"], 1636 ["c", "3", "bc", "3", "c", "bc"]] 1637 ); 1638 1639 /* Alternate file widths and lengths. 1640 */ 1641 1642 auto file3x2 = [["fld1", "fld2", "fld3"], 1643 ["a", "b", "c"], 1644 ["c", "b", "a"]]; 1645 1646 auto file3x2Path = buildPath(testDir, "file3x2.tsv"); 1647 auto file3x2NoHeaderPath = buildPath(testDir, "file3x2_noheader.tsv"); 1648 writeDataFile(file3x2Path, file3x2); 1649 writeDataFile(file3x2NoHeaderPath, file3x2[1 .. $]); 1650 1651 testSummarizer(["unittest-3x2-1", "-H", "--group-by", "1", "--values", "3", file3x2Path], 1652 file3x2, 1653 [["fld1", "fld3_values"], 1654 ["a", "c"], 1655 ["c", "a"]] 1656 ); 1657 testSummarizer(["unittest-3x2-2", "-H", "--group-by", "2", "--values", "3", file3x2Path], 1658 file3x2, 1659 [["fld2", "fld3_values"], 1660 ["b", "c|a"]] 1661 ); 1662 testSummarizer(["unittest-3x2-3", "-H", "--group-by", "2,1", "--values", "3", file3x2Path], 1663 file3x2, 1664 [["fld2", "fld1", "fld3_values"], 1665 ["b", "a", "c"], 1666 ["b", "c", "a"]] 1667 ); 1668 1669 auto file3x1 = [["fld1", "fld2", "fld3"], 1670 ["a", "b", "c"]]; 1671 1672 auto file3x1Path = buildPath(testDir, "file3x1.tsv"); 1673 auto file3x1NoHeaderPath = buildPath(testDir, "file3x1_noheader.tsv"); 1674 writeDataFile(file3x1Path, file3x1); 1675 writeDataFile(file3x1NoHeaderPath, file3x1[1 .. 
$]); 1676 1677 testSummarizer(["unittest-3x1-1", "-H", "--group-by", "1", "--values", "3", file3x1Path], 1678 file3x1, 1679 [["fld1", "fld3_values"], 1680 ["a", "c"]] 1681 ); 1682 testSummarizer(["unittest-3x1-2", "--group-by", "1", "--values", "3", file3x1NoHeaderPath], 1683 file3x1[1..$], 1684 [["a", "c"]] 1685 ); 1686 testSummarizer(["unittest-3x1-3", "-H", "--group-by", "2,1", "--values", "3", file3x1Path], 1687 file3x1, 1688 [["fld2", "fld1", "fld3_values"], 1689 ["b", "a", "c"]] 1690 ); 1691 testSummarizer(["unittest-3x1-3-named", "-H", "--group-by", "fld2,fld1", "--values", "fld3", file3x1Path], 1692 file3x1, 1693 [["fld2", "fld1", "fld3_values"], 1694 ["b", "a", "c"]] 1695 ); 1696 testSummarizer(["unittest-3x1-4", "--group-by", "2,1", "--values", "3", file3x1NoHeaderPath], 1697 file3x1[1..$], 1698 [["b", "a", "c"]] 1699 ); 1700 1701 auto file3x0 = [["fld1", "fld2", "fld3"]]; 1702 1703 auto file3x0Path = buildPath(testDir, "file3x0.tsv"); 1704 auto file3x0NoHeaderPath = buildPath(testDir, "file3x0_noheader.tsv"); 1705 writeDataFile(file3x0Path, file3x0); 1706 writeDataFile(file3x0NoHeaderPath, file3x0[1 .. 
$]); 1707 1708 1709 testSummarizer(["unittest-3x0-1", "-H", "--group-by", "1", "--values", "3", file3x0Path], 1710 file3x0, 1711 [["fld1", "fld3_values"]] 1712 ); 1713 testSummarizer(["unittest-3x0-1-named", "-H", "--group-by", "fld1", "--values", "fld3", file3x0Path], 1714 file3x0, 1715 [["fld1", "fld3_values"]] 1716 ); 1717 testSummarizer(["unittest-3x0-2", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1718 file3x0[1..$], 1719 [] 1720 ); 1721 testSummarizer(["unittest-3x0-3", "--write-header", "--group-by", "1", "--values", "3", file3x0NoHeaderPath], 1722 file3x0[1..$], 1723 [["field1", "field3_values"]] 1724 ); 1725 1726 1727 testSummarizer(["unittest-3x0-4", "-H", "--group-by", "2,1", "--values", "3", file3x0Path], 1728 file3x0, 1729 [["fld2", "fld1", "fld3_values"]] 1730 ); 1731 1732 testSummarizer(["unittest-3x0-5", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1733 file3x0[1..$], 1734 [] 1735 ); 1736 1737 testSummarizer(["unittest-3x0-6", "--write-header", "--group-by", "2,1", "--values", "3", file3x0NoHeaderPath], 1738 file3x0[1..$], 1739 [["field2", "field1", "field3_values"]] 1740 ); 1741 1742 auto file2x1 = [["fld1", "fld2"], 1743 ["a", "b"]]; 1744 1745 auto file2x1Path = buildPath(testDir, "file2x1.tsv"); 1746 auto file2x1NoHeaderPath = buildPath(testDir, "file2x1_noheader.tsv"); 1747 writeDataFile(file2x1Path, file2x1); 1748 writeDataFile(file2x1NoHeaderPath, file2x1[1 .. 
$]); 1749 1750 testSummarizer(["unittest-2x1-1", "-H", "--group-by", "1", "--values", "2", file2x1Path], 1751 file2x1, 1752 [["fld1", "fld2_values"], 1753 ["a", "b"]] 1754 ); 1755 testSummarizer(["unittest-2x1-2", "-H", "--group-by", "2,1", "--values", "1", file2x1Path], 1756 file2x1, 1757 [["fld2", "fld1", "fld1_values"], 1758 ["b", "a", "a"]] 1759 ); 1760 1761 auto file2x0 = [["fld1", "fld2"]]; 1762 1763 auto file2x0Path = buildPath(testDir, "file2x0.tsv"); 1764 auto file2x0NoHeaderPath = buildPath(testDir, "file2x0_noheader.tsv"); 1765 writeDataFile(file2x0Path, file2x0); 1766 writeDataFile(file2x0NoHeaderPath, file2x0[1 .. $]); 1767 1768 testSummarizer(["unittest-2x0-1", "-H", "--group-by", "1", "--values", "2", file2x0Path], 1769 file2x0, 1770 [["fld1", "fld2_values"]] 1771 ); 1772 testSummarizer(["unittest-2x0-2", "-H", "--group-by", "2,1", "--values", "1", file2x0Path], 1773 file2x0, 1774 [["fld2", "fld1", "fld1_values"]] 1775 ); 1776 1777 auto file1x2 = [["fld1"], 1778 ["a"], 1779 [""]]; 1780 1781 auto file1x2Path = buildPath(testDir, "file1x2.tsv"); 1782 auto file1x2NoHeaderPath = buildPath(testDir, "file1x2_noheader.tsv"); 1783 writeDataFile(file1x2Path, file1x2); 1784 writeDataFile(file1x2NoHeaderPath, file1x2[1 .. $]); 1785 1786 testSummarizer(["unittest-1x2-1", "-H", "--group-by", "1", "--values", "1", file1x2Path], 1787 file1x2, 1788 [["fld1", "fld1_values"], 1789 ["a", "a"], 1790 ["", ""]] 1791 ); 1792 1793 auto file1x2b = [["fld1"], 1794 [""], 1795 [""]]; 1796 1797 auto file1x2bPath = buildPath(testDir, "file1x2b.tsv"); 1798 auto file1x2bNoHeaderPath = buildPath(testDir, "file1x2b_noheader.tsv"); 1799 writeDataFile(file1x2bPath, file1x2b); 1800 writeDataFile(file1x2bNoHeaderPath, file1x2b[1 .. 
$]); 1801 1802 testSummarizer(["unittest-1x2b-2", "-H", "--group-by", "1", "--values", "1", file1x2bPath], 1803 file1x2b, 1804 [["fld1", "fld1_values"], 1805 ["", "|"]] 1806 ); 1807 1808 auto file1x1 = [["fld1"], 1809 ["x"]]; 1810 1811 auto file1x1Path = buildPath(testDir, "file1x1.tsv"); 1812 auto file1x1NoHeaderPath = buildPath(testDir, "file1x1_noheader.tsv"); 1813 writeDataFile(file1x1Path, file1x1); 1814 writeDataFile(file1x1NoHeaderPath, file1x1[1 .. $]); 1815 1816 testSummarizer(["unittest-1x1-1", "-H", "--group-by", "1", "--values", "1", file1x1Path], 1817 file1x1, 1818 [["fld1", "fld1_values"], 1819 ["x", "x"]] 1820 ); 1821 testSummarizer(["unittest-1x1-1-named", "-H", "--group-by", "fld1", "--values", "fld1", file1x1Path], 1822 file1x1, 1823 [["fld1", "fld1_values"], 1824 ["x", "x"]] 1825 ); 1826 1827 testSummarizer(["unittest-1x1-2", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1828 file1x1[1..$], 1829 [["x", "x"]] 1830 ); 1831 1832 testSummarizer(["unittest-1x1-3", "--write-header", "--group-by", "1", "--values", "1", file1x1NoHeaderPath], 1833 file1x1[1..$], 1834 [["field1", "field1_values"], 1835 ["x", "x"]] 1836 ); 1837 1838 auto file1x1b = [["fld1"], 1839 [""]]; 1840 1841 auto file1x1bPath = buildPath(testDir, "file1x1b.tsv"); 1842 auto file1x1bNoHeaderPath = buildPath(testDir, "file1x1b_noheader.tsv"); 1843 writeDataFile(file1x1bPath, file1x1b); 1844 writeDataFile(file1x1bNoHeaderPath, file1x1b[1 .. $]); 1845 1846 testSummarizer(["unittest-1x1b-1", "-H", "--group-by", "1", "--values", "1", file1x1bPath], 1847 file1x1b, 1848 [["fld1", "fld1_values"], 1849 ["", ""]] 1850 ); 1851 1852 auto file1x0 = [["fld1"]]; 1853 1854 auto file1x0Path = buildPath(testDir, "file1x0.tsv"); 1855 auto file1x0NoHeaderPath = buildPath(testDir, "file1x0_noheader.tsv"); 1856 writeDataFile(file1x0Path, file1x0); 1857 writeDataFile(file1x0NoHeaderPath, file1x0[1 .. 
$]); 1858 1859 testSummarizer(["unittest-1x0-1", "-H", "--group-by", "1", "--values", "1", file1x0Path], 1860 file1x0, 1861 [["fld1", "fld1_values"]] 1862 ); 1863 1864 testSummarizer(["unittest-1x0-2", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1865 file1x0[1..$], 1866 [] 1867 ); 1868 1869 testSummarizer(["unittest-1x0-3", "--write-header", "--group-by", "1", "--values", "1", file1x0NoHeaderPath], 1870 file1x0[1..$], 1871 [["field1", "field1_values"]] 1872 ); 1873 1874 /* Alternate delimiters. 1875 * 1876 * Note: In current unit test setup the data is already in memory (file1). 1877 * 'file1Path' points to a file with equivalent data, but not read, except if 1878 * processing the header line. A data file is created for the '%' and '#' 1879 * delimiter cases (these read the header), but we don't bother for the others. 1880 */ 1881 auto file1PctDelimPath = buildPath(testDir, "file1PctDelim.tsv"); 1882 auto file1HashDelimPath = buildPath(testDir, "file1HashDelim.tsv"); 1883 writeDataFile(file1PctDelimPath, file1, "%"); 1884 writeDataFile(file1HashDelimPath, file1, "#"); 1885 1886 testSummarizer(["unittest-delim-1", "-H", "--values", "1,2", "--delimiter", "%", file1PctDelimPath], 1887 file1, 1888 [["fld1_values", "fld2_values"], 1889 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1890 ); 1891 testSummarizer(["unittest-delim-1-named", "-H", "--values", "fld1,fld2", "--delimiter", "%", file1PctDelimPath], 1892 file1, 1893 [["fld1_values", "fld2_values"], 1894 ["a|c|c|a||c", "a|a|bc|c|bc|bc"]] 1895 ); 1896 testSummarizer(["unittest-delim-2", "-H", "--values", "1-2", "--values-delimiter", "$", file1Path], 1897 file1, 1898 [["fld1_values", "fld2_values"], 1899 ["a$c$c$a$$c", "a$a$bc$c$bc$bc"]] 1900 ); 1901 testSummarizer(["unittest-delim-3", "-H", "--values", "1,2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], 1902 file1, 1903 [["fld1_values", "fld2_values"], 1904 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1905 ); 1906 
testSummarizer(["unittest-delim-3-named", "-H", "--values", "fld1,fld2", "--delimiter", "#", "--values-delimiter", ",", file1HashDelimPath], 1907 file1, 1908 [["fld1_values", "fld2_values"], 1909 ["a,c,c,a,,c", "a,a,bc,c,bc,bc"]] 1910 ); 1911 testSummarizer(["unittest-delim-4", "--write-header", "--group-by", "2", "--values", "1", 1912 "--delimiter", "^", "--values-delimiter", ":", file1NoHeaderPath], 1913 file1[1..$], 1914 [["field2", "field1_values"], 1915 ["a", "a:c"], 1916 ["bc", "c::c"], 1917 ["c", "a"]] 1918 ); 1919 testSummarizer(["unittest-delim-5", "--group-by", "1,2", "--values", "2", "--delimiter", "/", 1920 "--values-delimiter", "\\", file1NoHeaderPath], 1921 file1[1..$], 1922 [["a", "a", "a"], 1923 ["c", "a", "a"], 1924 ["c", "bc", "bc\\bc"], 1925 ["a", "c", "c"], 1926 ["", "bc", "bc"]] 1927 ); 1928 } 1929 1930 /* Summary Operators and Calculators 1931 * 1932 * Two types of objects are used in implementation: Operators and Calculators. An Operator 1933 * represents a summary calculation specified on the command line, e.g. '--mean 5'. A 1934 * Calculator is used to manage the summary calculation for each unique key in the input. 1935 * 1936 * As an example, consider the command: 1937 * 1938 * $tsv-summarize --group-by 1 --mean 3 --mean 5 1939 * 1940 * This command will create two instances of a MeanOperator, one each for fields 3 and 5. 1941 * They produce the output field headers (e.g. "field3_mean", "field5_mean"). They also 1942 * create MeanCalculator objects for each unique value in field 1. For 'mean', a 1943 * calculator needs to track occurrence count and sum. Calculators produce the final 1944 * value when all processing is finished. 1945 * 1946 * Summary field headers 1947 * 1948 * There are several options for specifying summary field headers. The defaults combine the 1949 * operator name and the header of the field summarized. The defaults can be overridden on 1950 * on the command line. 
These scenarios are supported via the operator constructor and the
 * processHeaderLine() method.
 *
 * Missing field policy
 *
 * At present, tsv-summarize has a single policy for handling missing values that applies
 * to all operators. However, it is logically operator specific and is implemented that
 * way. The MissingFieldPolicy class describes the policy; each operator contains one.
 * Calculators access their operator's policy object.
 */

/** An Operator represents a summary calculation specified on the command line,
 * e.g. '--mean 5'.
 */
interface Operator
{
    @property string header();
    @property string name();
    void processHeaderLine(const char[][] fields);
    size_t[] numericFieldsToSave();     // Numeric fields this Operator needs saved
    size_t[] textFieldsToSave();        // Text fields this Operator needs saved
    Calculator makeCalculator();
}

/** A Calculator carries out a single computation. It is handed each input line and
 * produces the final value once all processing is finished.
 */
interface Calculator
{
    void processNextLine(const char[][] fields);
    string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions);
}

/** MissingFieldPolicy describes what happens when a missing value is encountered.
 *
 * A field is considered missing when it is empty. A missing field is either used
 * unchanged, excluded, or replaced. Supplying a non-empty replacement string selects
 * replacement, regardless of the excludeMissing argument.
 */
final class MissingFieldPolicy
{
    private bool _useMissing = true;        // Missing values are processed unchanged.
    private bool _replaceMissing = false;   // Missing values are replaced.
    private string _missingReplacement;     // The replacement text when replacing.

    this (const bool excludeMissing = false, string missingReplacement = "")
    {
        updatePolicy(excludeMissing, missingReplacement);
    }

    /* Resets all three policy fields from the arguments. */
    void updatePolicy(const bool excludeMissing, string missingReplacement)
    {
        immutable bool haveReplacement = missingReplacement.length > 0;

        _missingReplacement = missingReplacement;
        _replaceMissing = haveReplacement;
        _useMissing = !(excludeMissing || haveReplacement);
    }

    /* True when the field is empty. */
    final bool isMissingField(const char[] field) const
    {
        return !field.length;
    }

    final bool useMissing() const @property
    {
        return _useMissing;
    }

    /* True only when missing values are neither used as-is nor replaced. */
    final bool excludeMissing() const @property
    {
        return !(_useMissing || _replaceMissing);
    }

    final bool replaceMissing() const @property
    {
        return _replaceMissing;
    }

    final string missingReplacement() const @property
    {
        return _missingReplacement;
    }
}

/* The SharedFieldValues and UniqueKeyValuesLists classes manage lists of values collected
 * while reading data. Operations like median collect all values and operate on them when
 * running the final calculation. Value lists are needed for each unique key. A command
 * using multiple Operators may save multiple fields. And, different Operators may be run
 * against the same field.
 *
 * The last part motivates these classes. Handling large data sets necessitates minimizing
 * in-memory storage, making it desirable to share identical lists between Calculators.
 * Otherwise, each Calculator could implement its own storage, which would be simpler.
 *
 * The setup works as follows:
 * - Operators advertise fields they need saved ([text|numeric]FieldsToSave methods).
 * - The SummarizerBase object keeps a SharedFieldValues object, which in turn keeps a list
 *   of the fields advertised by Operators as needing sharing.
This list gets created
 *   during command initialization (SummarizerBase.setOperators).
 * - The SharedFieldValues object is used to create a UniqueKeyValuesLists object every
 *   time a new unique key is found, in parallel to the Calculator objects created for the
 *   key. The UniqueKeyValuesLists objects are managed by derived Summarizer classes.
 * - A unique key's UniqueKeyValuesLists object is passed each input line, same as
 *   Calculators, saving the values.
 * - Calculators retrieve the saved values during the calculation phase. The calculator's
 *   processNextField method is typically a no-op.
 * - Calculators cannot make assumptions about the order of the saved values. This is a
 *   pragmatic concession to median and quantile calculations, which need to sort the data,
 *   at least partially. Rather than generate sorted copies, the current algorithms
 *   sort the data in place.
 *
 * One concession to duplicate storage is that text and numeric versions of the same
 * field might be stored. The reason is that it's important to convert text to numbers
 * as they are read so that useful error messages can be generated. And, storing both
 * forms of the same field should be less common.
 *
 * The current implementation uses the same missing values policy for all fields. If
 * multiple policies become supported this will need to change.
 *
 * Built-in calculations - UniqueKeyValuesLists have a built-in median operation. This is
 * to avoid repeated calculations of the median by different calculations.
 */

final class SharedFieldValues
{
    // Indices of the fields whose values need to be saved.
    private size_t[] _numericFieldIndices;
    private size_t[] _textFieldIndices;

    /* Called during summarizer setup to register a field index whose numeric values are
     * to be saved. eg. '--median 7' will end up calling addNumericIndex(6), 6 being the
     * zero-based index. A specific index is only added once.
     */
    final void addNumericIndex (size_t index)
    {
        if (canFind(_numericFieldIndices, index)) return;
        _numericFieldIndices ~= index;
    }

    /* Same as addNumericIndex, but for fields saved as text. */
    final void addTextIndex (size_t index)
    {
        if (canFind(_textFieldIndices, index)) return;
        _textFieldIndices ~= index;
    }

    /* Called every time a new key is found, or once at the beginning of the program if no keys
     * are being used (entire column summarized).
     */
    final UniqueKeyValuesLists makeUniqueKeyValuesLists()
    {
        return new UniqueKeyValuesLists(_numericFieldIndices, _textFieldIndices);
    }
}

final class UniqueKeyValuesLists
{
    /* A FieldValues object holds a list of values collected for a specific field. A
     * unique key may hold several. For example, the command:
     *     $ tsv-summarize --group-by 1 --median 4 --median 5
     * requires keeping lists for both fields 4 and 5. This in turn will result in
     * _numericFieldValues being a 2 element array, one with a list of field 4 values,
     * the second of field 5 values. Linear search is used to find a specific field.
     */
    private FieldValues!double[] _numericFieldValues;
    private FieldValues!string[] _textFieldValues;
    private double[] _numericFieldMedians;

    /* The constructor takes the arrays of field indices to be saved and creates one
     * FieldValues list per index.
     */
    this(const size_t[] numericFieldIndices, const size_t[] textFieldIndices)
    {
        _numericFieldValues.reserve(numericFieldIndices.length);
        foreach (fieldIndex; numericFieldIndices)
            _numericFieldValues ~= new FieldValues!double(fieldIndex);

        _textFieldValues.reserve(textFieldIndices.length);
        foreach (fieldIndex; textFieldIndices)
            _textFieldValues ~= new FieldValues!string(fieldIndex);
    }

    /* Hands the line to every saved list, numeric lists first, then text lists. */
    void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
    {
        foreach (numericList; _numericFieldValues)
            numericList.processNextLine(fields, missingPolicy);

        foreach (textList; _textFieldValues)
            textList.processNextLine(fields, missingPolicy);
    }

    /* Linear search for the numeric list of a field. The list is expected to exist. */
    private FieldValues!double findNumericFieldValues(size_t index)
    {
        static bool isForField(FieldValues!double candidate, size_t wanted)
        {
            return candidate.fieldIndex == wanted;
        }

        auto matches = find!isForField(_numericFieldValues, index);
        assert(!matches.empty);
        return matches.front;
    }

    /* Linear search for the text list of a field. The list is expected to exist. */
    private FieldValues!string findTextFieldValues(size_t index)
    {
        static bool isForField(FieldValues!string candidate, size_t wanted)
        {
            return candidate.fieldIndex == wanted;
        }

        auto matches = find!isForField(_textFieldValues, index);
        assert(!matches.empty);
        return matches.front;
    }

    final double[] numericValues(size_t index)
    {
        auto list = findNumericFieldValues(index);
        return list.getArray;
    }

    final double[] numericValuesSorted(size_t index)
    {
        auto list = findNumericFieldValues(index);
        return list.getSortedArray;
    }

    final string[] textValues(size_t index)
    {
        auto list = findTextFieldValues(index);
        return list.getArray;
    }

    final string[] textValuesSorted(size_t index)
    {
        auto list = findTextFieldValues(index);
        return list.getSortedArray;
    }

    final double numericValuesMedian(size_t index)
    {
        auto list = findNumericFieldValues(index);
        return list.median;
    }

    /* The values saved for one field, with the sort state and median cached until the
     * next value is added.
     */
    private final class FieldValues(ValueType)
    {
        import std.array : appender;
        private size_t _fieldIndex;
        private Appender!(ValueType[]) _values;
        private bool _haveMedian = false;
        private bool _isSorted = false;
        private ValueType _medianValue;

        this(size_t fieldIndex)
        {
            _fieldIndex = fieldIndex;
        }

        final size_t length() const @property
        {
            return _values.data.length;
        }

        final size_t fieldIndex() const @property
        {
            return _fieldIndex;
        }

        /* Stores a value and invalidates the cached median and sort state. */
        private void appendValue(ValueType value)
        {
            _values.put(value);
            _haveMedian = false;
            _isSorted = false;
        }

        /* Saves this list's field from the line, applying the missing field policy:
         * the field is stored as-is, replaced, or skipped.
         */
        final void processNextLine(const char[][] fields, MissingFieldPolicy missingPolicy)
        {
            debug writefln("[%s]: %s", __FUNCTION__, fields.to!string);

            const char[] field = fields[_fieldIndex];
            immutable bool storeAsIs =
                missingPolicy.useMissing || !missingPolicy.isMissingField(field);

            if (storeAsIs)
            {
                appendValue(field.to!ValueType);
            }
            else if (missingPolicy.replaceMissing)
            {
                appendValue(missingPolicy.missingReplacement.to!ValueType);
            }
        }

        /* Return an input range of the values. */
        final auto values()
        {
            return _values.data;
        }

        final ValueType[] getArray()
        {
            return _values.data;
        }

        /* Sorts the saved values in place on first use, then returns them. */
        final ValueType[] getSortedArray()
        {
            import std.algorithm : sort;

            if (_isSorted) return _values.data;

            sort(_values.data);
            _isSorted = true;
            return _values.data;
        }

        /* Computes the median on first use and caches it. */
        final ValueType median()
        {
            if (_haveMedian) return _medianValue;

            import tsv_utils.common.numerics : rangeMedian;
            _medianValue = rangeMedian(_values.data);
            _haveMedian = true;
            return _medianValue;
        }
    }
}

/** SingleFieldOperator is a base class for single field operators, the most common
 * Operator. Derived classes implement makeCalculator and the Calculator class it returns.
2257 */ 2258 class SingleFieldOperator : Operator 2259 { 2260 import std.typecons : Flag; 2261 2262 private string _name; 2263 private string _header; 2264 private size_t _fieldIndex; 2265 private bool _useHeaderSuffix; 2266 private bool _allowCustomHeader; 2267 private bool _hasCustomHeader = false; 2268 private size_t[] _numericFieldsToSave; 2269 private size_t[] _textFieldsToSave; 2270 private MissingFieldPolicy _missingPolicy; 2271 2272 this(string operatorName, size_t fieldIndex, MissingFieldPolicy missingPolicy, 2273 Flag!"useHeaderSuffix" useHeaderSuffix = Yes.useHeaderSuffix, 2274 Flag!"allowCustomHeader" allowCustomHeader = Yes.allowCustomHeader) 2275 { 2276 _name = operatorName; 2277 _fieldIndex = fieldIndex; 2278 _missingPolicy = missingPolicy; 2279 _useHeaderSuffix = useHeaderSuffix; 2280 _allowCustomHeader = allowCustomHeader; 2281 // Default header. May be overrridden by custom header or header line. 2282 _header = 2283 fieldHeaderFromIndex(fieldIndex) 2284 .summaryHeaderFromFieldHeader(_useHeaderSuffix ? operatorName : ""); 2285 } 2286 2287 void setCustomHeader (string customHeader) 2288 { 2289 assert(_allowCustomHeader); 2290 _header = customHeader; 2291 _hasCustomHeader = true; 2292 } 2293 2294 final string name() const @property 2295 { 2296 return _name; 2297 } 2298 2299 final bool allowCustomHeader() const @property 2300 { 2301 return _allowCustomHeader; 2302 } 2303 2304 /* saveFieldValues[Numeric|Text] are called by derived classes to indicate that field 2305 * that the field values should be saved. These should called during construction. 
2306 */ 2307 final void setSaveFieldValuesNumeric() 2308 { 2309 _numericFieldsToSave ~= _fieldIndex; 2310 } 2311 2312 final void setSaveFieldValuesText() 2313 { 2314 _textFieldsToSave ~= _fieldIndex; 2315 } 2316 2317 final MissingFieldPolicy missingPolicy() @property 2318 { 2319 return _missingPolicy; 2320 } 2321 2322 final size_t fieldIndex() const @property 2323 { 2324 return _fieldIndex; 2325 } 2326 2327 final string header() const @property 2328 { 2329 return _header; 2330 } 2331 2332 final bool useHeaderSuffix() const @property 2333 { 2334 return _useHeaderSuffix; 2335 } 2336 2337 void processHeaderLine(const char[][] fields) 2338 { 2339 if (!_hasCustomHeader) { 2340 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2341 _header = summaryHeaderFromFieldHeader(fields[_fieldIndex].to!string, 2342 _useHeaderSuffix ? _name : ""); 2343 } 2344 } 2345 2346 final size_t[] numericFieldsToSave() 2347 { 2348 return _numericFieldsToSave; 2349 } 2350 2351 final size_t[] textFieldsToSave() 2352 { 2353 return _textFieldsToSave; 2354 } 2355 2356 abstract SingleFieldCalculator makeCalculator(); 2357 } 2358 2359 /** SingleFieldCalculator is a base class for the common case of calculators using a single 2360 * field. Derived classes implement processNextField() rather than processNextLine(). 
2361 */ 2362 class SingleFieldCalculator : Calculator 2363 { 2364 private size_t _fieldIndex; 2365 2366 this(size_t fieldIndex) 2367 { 2368 _fieldIndex = fieldIndex; 2369 } 2370 2371 final size_t fieldIndex() const @property 2372 { 2373 return _fieldIndex; 2374 } 2375 2376 final void processNextLine(const char[][] fields) 2377 { 2378 debug writefln("[%s %d] fields: %s", __FUNCTION__, _fieldIndex, fields.to!string); 2379 2380 auto missingPolicy = getOperator.missingPolicy; 2381 const char[] field = fields[_fieldIndex]; 2382 2383 if (missingPolicy.useMissing || !missingPolicy.isMissingField(field)) 2384 { 2385 processNextField(field); 2386 } 2387 else if (missingPolicy.replaceMissing) 2388 { 2389 processNextField(missingPolicy.missingReplacement); 2390 } 2391 } 2392 2393 abstract SingleFieldOperator getOperator(); 2394 2395 abstract void processNextField(const char[] field); 2396 } 2397 2398 /* Unittest helper functions. Only compiled when -unittest is in effect. */ 2399 version(unittest) 2400 { 2401 /** A helper for SingleFieldOperator unit tests. 2402 * 2403 * testSingleFieldOperator takes a set of split file values, a field index, a header 2404 * suffix, and a set of expected values. The expected values array contains the 2405 * initial value (zero entries) and the expected values after each line. (One more 2406 * expected value than input lines.) The zero entry case is what is generated for an 2407 * empty file. An example testing the 'min' operator against a file with 2 columns, 2408 * 3 rows, using field index 1: 2409 * 2410 * testSingleFieldOperator!MinOperator( 2411 * [["10", "100"], // The split file. 3 lines by 2 rows. 2412 * ["5", "50"], 2413 * ["20", "200"]], 2414 * 1, // Field index (zero-based, so "100", "50", "200") 2415 * "min", // The header suffix, normally the operator name. 2416 * ["nan", "100", "50", "50"]); // Min value after processing each line. 2417 * 2418 * A typical operator unit test uses three "files", one each of 1x3, 2x3, and 3x3. 
     * The operator is then tested against each column, a total of six calls. Headers
     * are automatically checked. Additional entries can be used to extend coverage.
     *
     * A non-default MissingFieldPolicy can be provided as an optional last argument.
     * Operator tests should include exclusion and replacement variations. See operator
     * unit tests for details.
     *
     * The testSingleFieldOperatorBase adds an additional capability - Custom operator
     * init arguments. Currently this is used only by the quantile operator.
     *
     * These tests do not check unique key behavior (group-by). Operators don't have info
     * about unique keys, and interact with them only indirectly, via Calculators.
     */
    void testSingleFieldOperator(OperatorClass : SingleFieldOperator)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy = new MissingFieldPolicy)
    {
        testSingleFieldOperatorBase!OperatorClass(splitFile, fieldIndex, headerSuffix, expectedValues, missingPolicy);
    }

    /* The implementation behind testSingleFieldOperator. Any extraOpInitArgs are passed
     * to the OperatorClass constructor after the field index and missing policy.
     */
    void testSingleFieldOperatorBase(OperatorClass : SingleFieldOperator, T...)
        (const char[][][] splitFile, size_t fieldIndex, string headerSuffix,
         const char[][] expectedValues,
         MissingFieldPolicy missingPolicy,
         T extraOpInitArgs)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* The field count is taken from the first row, so splitFile needs at least one row. */
        auto numFields = (splitFile[0]).length;

        assert(fieldIndex < numFields,
               format("[testSingleFieldOperator] Invalid field index. headerSuffix: %s",
                      headerSuffix));
        assert(splitFile.length + 1 == expectedValues.length,
               format("[testSingleFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      headerSuffix));

        /* printOptions - Only the 'values-delimiter' (2nd arg) is used in these tests. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* An input header line. */
        string[] inputHeaderLine = new string[numFields];
        foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string;

        /* The different expected output field headers. */
        auto outputFieldHeaderWithNoHeaderLine =
            fieldHeaderFromIndex(fieldIndex)
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto outputFieldHeaderFromHeaderLine =
            inputHeaderLine[fieldIndex]
            .summaryHeaderFromFieldHeader(headerSuffix);
        auto customOutputFieldHeader = "custom";

        /* The combinations of input header line and output header that are exercised. */
        enum HeaderUsecase {
            HeaderLine_DefaultHeader,
            HeaderLine_CustomHeader,
            NoHeaderLine_DefaultHeader,
            NoHeaderLine_CustomHeader,
            NoHeaderLine_NoOutputHeader,
        }

        /* Builds the message for a failed header assertion. */
        string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'",
                          op.name, hc, actual, expected);
        }

        /* Builds the message for a failed summary value assertion. */
        string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, size_t fieldIndex,
                                  const char[] actual, const char[] expected)
        {
            return format("[testSingleFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d, FieldIndex: %d\n Actual: '%s'; Expected: '%s'",
                          op.name, hc, rowIndex, fieldIndex, actual, expected);
        }

        /* Run the logic for each header use case. */
        foreach (hc; EnumMembers!HeaderUsecase)
        {
            bool hasInputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader
            );
            bool hasOutputHeader = (
                hc == HeaderUsecase.HeaderLine_DefaultHeader ||
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_DefaultHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
            );
            bool hasCustomHeader = (
                hc == HeaderUsecase.HeaderLine_CustomHeader ||
                hc == HeaderUsecase.NoHeaderLine_CustomHeader
            );

            if (hasCustomHeader) assert(hasOutputHeader);

            /* A fresh operator and summarizer are created for every use case. */
            auto op = new OperatorClass(fieldIndex, missingPolicy, extraOpInitArgs);

            if (hasCustomHeader)
            {
                if (!op.allowCustomHeader) continue; // Custom header not supported by this operator
                op.setCustomHeader(customOutputFieldHeader);
            }

            Operator[] operatorArray;
            operatorArray ~= op;

            auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy);
            summarizer.setOperators(inputRangeObject(operatorArray));

            if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine);

            if (hasOutputHeader)
            {
                /* Write the header line. Note that this is a one-field header. */
                auto headerLineOutput = appender!(char[])();
                summarizer.writeSummaryHeader(headerLineOutput, printOptions);

                /* Test that the header was generated correctly.
                 *
                 * Note: Because the output is generated by a Summarizer, it will have a
                 * trailing newline. Use chomp to trim it.
                 */
                final switch (hc)
                {
                case HeaderUsecase.HeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == outputFieldHeaderFromHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               outputFieldHeaderFromHeaderLine));
                    break;
                case HeaderUsecase.NoHeaderLine_DefaultHeader:
                    assert(headerLineOutput.data.chomp == outputFieldHeaderWithNoHeaderLine,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               outputFieldHeaderWithNoHeaderLine));
                    break;
                case HeaderUsecase.HeaderLine_CustomHeader:
                case HeaderUsecase.NoHeaderLine_CustomHeader:
                    assert(headerLineOutput.data.chomp == customOutputFieldHeader,
                           headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp,
                                               customOutputFieldHeader));
                    break;
                case HeaderUsecase.NoHeaderLine_NoOutputHeader:
                    break;
                }

            }

            /* For each line, process the line, generate the output, and test that the
             * value is correct. Start with the empty file case.
             */
            foreach (i, const char[] expected; expectedValues)
            {
                if (i > 0) summarizer.processNextLine(splitFile[i - 1]);
                auto summaryLineOutput = appender!(char[])();
                summarizer.writeSummaryBody(summaryLineOutput, printOptions);
                assert(summaryLineOutput.data.chomp == expected,
                       valueAssertMessage(operatorArray[0], hc, i, fieldIndex,
                                          summaryLineOutput.data.chomp, expectedValues[i]));
            }
        }
    }
}

/** ZeroFieldOperator is a base class for operators that take no input. The main use
 * case is the CountOperator, which counts the occurrences of each unique key. Other
 * uses are possible, for example, weighted random number assignment.
 *
 * The primary rationale for ZeroFieldOperator and ZeroFieldCalculator is to clarify
 * the information available to such a routine.
In particular, the split fields passed
 * to processHeaderLine and processNextLine don't include all fields in the input,
 * something that might not be obvious when implementing an operator. (Only fields
 * required by operators acting on specific fields are included.)
 */
class ZeroFieldOperator : Operator
{
    import std.typecons : Flag;

    private string _name;
    private string _header;

    /* The operator name doubles as the default header. */
    this(string operatorName)
    {
        _header = operatorName;
        _name = operatorName;
    }

    void setCustomHeader (string customHeader)
    {
        _header = customHeader;
    }

    bool allowCustomHeader() const @property
    {
        return true;
    }

    final string name() const @property
    {
        return _name;
    }

    final string header() const @property
    {
        return _header;
    }

    /* A no-op. ZeroFieldOperators have no access to the header line. */
    final void processHeaderLine(const char[][] fields) { }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] numericFieldsToSave()
    {
        return null;
    }

    /* Always empty. ZeroFieldOperators have no access to fields. */
    final size_t[] textFieldsToSave()
    {
        return null;
    }

    abstract ZeroFieldCalculator makeCalculator();
}

/** ZeroFieldCalculator is a base class for calculators that don't use fields as input.
 * In particular, the Count operator. It is a companion to the ZeroFieldOperator class.
 *
 * Derived classes implement processNextEntry() rather than processNextLine(), and the
 * single argument form of calculate() given as an abstract function.
 */
class ZeroFieldCalculator : Calculator
{
    this() { }

    /* The fields are ignored; each line is simply reported to processNextEntry(). */
    final void processNextLine(const char[][] fields)
    {
        debug writefln("[%s]", __FUNCTION__,);
        processNextEntry();
    }

    /* The saved values lists are ignored; forwards to the single argument calculate(). */
    final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
    {
        return calculate(printOptions);
    }

    abstract void processNextEntry();
    abstract string calculate(const ref SummarizerPrintOptions printOptions);
}

version(unittest)
{
    /* A helper for ZeroFieldOperator unit tests.
     *
     * testZeroFieldOperator takes a set of split file values, a default header, and a
     * set of expected values. The expected values array needs one more entry than the
     * number of input lines.
     *
     * testZeroFieldOperator is very similar to testSingleFieldOperator, except that
     * there is no use of field indices and fewer types of headers. See the latter's
     * documentation and the CountOperator unit tests for examples.
     */
    void testZeroFieldOperator(OperatorClass : ZeroFieldOperator)
        (const char[][][] splitFile, string defaultHeader, const char[][] expectedValues)
    {
        import std.format : format;
        import std.array : appender;
        import std.string : chomp;
        import std.traits : EnumMembers;

        /* The field count is taken from the first row, so splitFile needs at least one row. */
        auto numFields = (splitFile[0]).length;

        assert(splitFile.length + 1 == expectedValues.length,
               format("[testZeroFieldOperator] Need one more expected value than number of rows. headerSuffix: %s",
                      defaultHeader));

        /* printOptions - Not used in these tests, but needed for API calls. */
        auto printOptions = SummarizerPrintOptions('#', '|');

        /* Missing policy doesn't apply to zero field operators, but the summarizer needs the object. */
        auto missingPolicy = new MissingFieldPolicy;

        /* An input header line.
*/ 2705 string[] inputHeaderLine = new string[numFields]; 2706 foreach (i; 0 .. numFields) inputHeaderLine[i] = "header" ~ i.to!string; 2707 2708 auto customOutputFieldHeader = "custom"; 2709 2710 enum HeaderUsecase { 2711 HeaderLine_DefaultHeader, 2712 HeaderLine_CustomHeader, 2713 NoHeaderLine_DefaultHeader, 2714 NoHeaderLine_CustomHeader, 2715 NoHeaderLine_NoOutputHeader, 2716 } 2717 2718 string headerAssertMessage(Operator op, HeaderUsecase hc, const char[] actual, const char[] expected) 2719 { 2720 return format("[testZeroFieldOperator] Unexpected header. Operator: %s; Usecase: %s; Actual: '%s'; Expected: '%s'", 2721 op.name, hc, actual, expected); 2722 } 2723 2724 string valueAssertMessage(Operator op, HeaderUsecase hc, size_t rowIndex, 2725 const char[] actual, const char[] expected) 2726 { 2727 return format("[testZeroFieldOperator] Unexpected value. Operator: %s; Usecase: %s; RowIndex: %d\n Actual: '%s'; Expected: '%s'", 2728 op.name, hc, rowIndex, actual, expected); 2729 } 2730 2731 /* Run the logic for each header use case. 
*/ 2732 foreach (hc; EnumMembers!HeaderUsecase) 2733 { 2734 bool hasInputHeader = ( 2735 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2736 hc == HeaderUsecase.HeaderLine_CustomHeader 2737 ); 2738 bool hasOutputHeader = ( 2739 hc == HeaderUsecase.HeaderLine_DefaultHeader || 2740 hc == HeaderUsecase.HeaderLine_CustomHeader || 2741 hc == HeaderUsecase.NoHeaderLine_DefaultHeader || 2742 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2743 ); 2744 bool hasCustomHeader = ( 2745 hc == HeaderUsecase.HeaderLine_CustomHeader || 2746 hc == HeaderUsecase.NoHeaderLine_CustomHeader 2747 ); 2748 2749 if (hasCustomHeader) assert(hasOutputHeader); 2750 2751 auto op = new OperatorClass(); 2752 2753 if (hasCustomHeader) 2754 { 2755 if (!op.allowCustomHeader) continue; // Custom header not support by this operator 2756 op.setCustomHeader(customOutputFieldHeader); 2757 } 2758 2759 Operator[] operatorArray; 2760 operatorArray ~= op; 2761 2762 auto summarizer = new NoKeySummarizer!(typeof(appender!(char[])()))('#', missingPolicy); 2763 summarizer.setOperators(inputRangeObject(operatorArray)); 2764 if (hasInputHeader) summarizer.processHeaderLine(inputHeaderLine); 2765 2766 if (hasOutputHeader) 2767 { 2768 /* Write the header line. Note that this is a one-field header, */ 2769 auto headerLineOutput = appender!(char[])(); 2770 summarizer.writeSummaryHeader(headerLineOutput, printOptions); 2771 2772 /* Test that the header was generated correctly. 2773 * 2774 * Note: Because the output is generated by a Summarizer, it will have a 2775 * trailing newline. Use chomp to trim it. 
2776 */ 2777 final switch (hc) 2778 { 2779 case HeaderUsecase.HeaderLine_DefaultHeader: 2780 case HeaderUsecase.NoHeaderLine_DefaultHeader: 2781 assert(headerLineOutput.data.chomp == defaultHeader, 2782 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2783 defaultHeader)); 2784 break; 2785 case HeaderUsecase.HeaderLine_CustomHeader: 2786 case HeaderUsecase.NoHeaderLine_CustomHeader: 2787 assert(headerLineOutput.data.chomp == customOutputFieldHeader, 2788 headerAssertMessage(operatorArray[0], hc, headerLineOutput.data.chomp, 2789 customOutputFieldHeader)); 2790 break; 2791 case HeaderUsecase.NoHeaderLine_NoOutputHeader: 2792 break; 2793 } 2794 2795 } 2796 2797 /* For each line, process the line, generate the output, and test that the 2798 * value is correct. Start with the empty file case. 2799 */ 2800 foreach (i, const char[] expected; expectedValues) 2801 { 2802 if (i > 0) summarizer.processNextLine(splitFile[i - 1]); 2803 auto summaryLineOutput = appender!(char[])(); 2804 summarizer.writeSummaryBody(summaryLineOutput, printOptions); 2805 assert(summaryLineOutput.data.chomp == expected, 2806 valueAssertMessage(operatorArray[0], hc, i, 2807 summaryLineOutput.data.chomp, expectedValues[i])); 2808 } 2809 } 2810 } 2811 } 2812 2813 /* Specific operators. 2814 * 2815 * Notes: 2816 * - The 'Calculator' inner classes are 'static'. This means inner class instances do not 2817 * keep a reference to the context of the outer class. In exchange, Calculator instances 2818 * need to hold all needed state, typically the field index they are summarizing. 2819 */ 2820 2821 /** CountOperator counts the number of occurrences of each unique key, or the number of 2822 * input lines if there is no unique key. 2823 * 2824 * CountOperator differs from most other operators in that it doesn't summarize a specific 2825 * field on the line. Instead it is summarizing a property of the unique key itself. For 2826 * this reason it doesn't derive from SingleFieldOperator. 
*/
final class CountOperator : ZeroFieldOperator
{
    /** Registers the operator under the name "count". */
    this()
    {
        super("count");
    }

    final override ZeroFieldCalculator makeCalculator()
    {
        return new CountCalculator;
    }

    /** Counts the number of processNextEntry() calls. */
    static final class CountCalculator : ZeroFieldCalculator
    {
        private size_t _count = 0;

        final override void processNextEntry()
        {
            ++_count;
        }

        /** Returns the current count, formatted by printOptions.formatNumber. */
        final override string calculate(const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_count);
        }
    }
}

unittest // CountOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    /* The count depends only on the number of lines, never on the field contents. */
    foreach (inputFile; [oneColumn, twoColumns, threeColumns])
    {
        testZeroFieldOperator!CountOperator(inputFile, "count", ["0", "1", "2", "3"]);
    }
}

/** RetainOperator retains the first occurrence of a field, without changing the header.
 *
 * RetainOperator is intended for fields where the value is expected to be the same for
 * all occurrences of the unique key, and the goal is to pass the value through unchanged.
 * It is like FirstOperator, except that the original header is preserved. The original
 * header preservation is set up in the call to the SingleFieldOperator constructor.
 *
 * Notes:
 * - An option to signal an error if multiple values are encountered might be useful.
*/
final class RetainOperator : SingleFieldOperator
{
    /** Registers the operator as "retain", passing No.useHeaderSuffix and
     * No.allowCustomHeader to the base class constructor.
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("retain", fieldIndex, missingPolicy, No.useHeaderSuffix, No.allowCustomHeader);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RetainCalculator(fieldIndex);
    }

    /** Remembers the first field value passed to processNextField. */
    final class RetainCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RetainOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Only the first value is kept; later values are ignored. */
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the retained value, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // RetainOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!RetainOperator(oneColumn, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(twoColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 0, "", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 1, "", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!RetainOperator(threeColumns, 2, "", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!RetainOperator(oneColumnMissing, 0, "", ["", "", "r2c1", "r2c1"],
                                           excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    testSingleFieldOperator!RetainOperator(oneColumnMissing, 0, "", ["", "NA", "NA", "NA"],
                                           replaceMissing);
}

/** FirstOperator outputs the first value found for the field.
 */
final class FirstOperator : SingleFieldOperator
{
    /** Registers the operator under the name "first". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("first", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new FirstCalculator(fieldIndex);
    }

    /** Remembers the first field value passed to processNextField. */
    final class FirstCalculator : SingleFieldCalculator
    {
        private bool _done = false;
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override FirstOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Only the first value is kept; later values are ignored. */
            if (_done) return;

            _value = to!string(nextField);
            _done = true;
        }

        /** Returns the first value seen, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // FirstOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!FirstOperator(oneColumn, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(twoColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 0, "first", ["", "r1c1", "r1c1", "r1c1"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 1, "first", ["", "r1c2", "r1c2", "r1c2"]);
    testSingleFieldOperator!FirstOperator(threeColumns, 2, "first", ["", "r1c3", "r1c3", "r1c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!FirstOperator(oneColumnMissing, 0, "first", ["", "", "r2c1", "r2c1"],
                                          excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    testSingleFieldOperator!FirstOperator(oneColumnMissing, 0, "first", ["", "NA", "NA", "NA"],
                                          replaceMissing);
}

/** LastOperator outputs the last value found for the field.
 */
final class LastOperator : SingleFieldOperator
{
    /** Registers the operator under the name "last". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("last", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new LastCalculator(fieldIndex);
    }

    /** Keeps the most recent field value passed to processNextField. */
    final class LastCalculator : SingleFieldCalculator
    {
        private string _value = "";

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override LastOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* Every call overwrites the previous value. */
            _value = to!string(nextField);
        }

        /** Returns the latest value seen, or the empty string if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _value;
        }
    }
}

unittest // LastOperator
{
    auto oneColumn = [["r1c1"], ["r2c1"], ["r3c1"]];
    auto twoColumns = [["r1c1", "r1c2"], ["r2c1", "r2c2"], ["r3c1", "r3c2"]];
    auto threeColumns = [["r1c1", "r1c2", "r1c3"], ["r2c1", "r2c2", "r2c3"], ["r3c1", "r3c2", "r3c3"]];

    testSingleFieldOperator!LastOperator(oneColumn, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(twoColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 0, "last", ["", "r1c1", "r2c1", "r3c1"]);
    testSingleFieldOperator!LastOperator(threeColumns, 1, "last", ["", "r1c2", "r2c2", "r3c2"]);
    testSingleFieldOperator!LastOperator(threeColumns, 2, "last", ["", "r1c3", "r2c3", "r3c3"]);

    auto oneColumnMissing = [[""], ["r2c1"], ["r3c1"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!LastOperator(oneColumnMissing, 0, "last", ["", "", "r2c1", "r3c1"],
                                         excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "NA");
    testSingleFieldOperator!LastOperator(oneColumnMissing, 0, "last", ["", "NA", "r2c1", "r3c1"],
                                         replaceMissing);
}

/** MinOperator outputs the minimum value for the field. This is a numeric operator.
 *
 * This operator returns the original string without additional numeric formatting.
 * This can be useful when joining back to the original data. This is different than
 * numeric operators that perform calculations.
 */
final class MinOperator : SingleFieldOperator
{
    /** Registers the operator under the name "min". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("min", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MinCalculator(fieldIndex);
    }

    /** Tracks the smallest value seen together with its original text. */
    final class MinCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MinOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* The field is always converted, so a non-numeric field throws from the
             * conversion even when it would not become the new minimum.
             */
            double fieldValue = to!double(nextField);

            /* The first value is always taken; after that only strictly smaller ones. */
            if (_isFirst || fieldValue < _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the minimum, or "nan" if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}

unittest // MinOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MinOperator(oneColumn, 0, "min", ["nan", "10", "9.5", "9.5"]);
    testSingleFieldOperator!MinOperator(twoColumns, 0, "min", ["nan", "20", "20", "20"]);
    testSingleFieldOperator!MinOperator(twoColumns, 1, "min", ["nan", "-30", "-30", "-31"]);
    testSingleFieldOperator!MinOperator(threeColumns, 0, "min", ["nan", "9009", "199", "199"]);
    testSingleFieldOperator!MinOperator(threeColumns, 1, "min", ["nan", "9", "0", "0"]);
    testSingleFieldOperator!MinOperator(threeColumns, 2, "min", ["nan", "-4.5", "-4.5", "-4.5"]);

    auto oneColumnMissing = [[""], ["10"], ["-10"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MinOperator(oneColumnMissing, 0, "min", ["nan", "nan", "10", "-10"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5");
    testSingleFieldOperator!MinOperator(oneColumnMissing, 0, "min", ["nan", "5", "5", "-10"],
                                        replaceMissing);
}

/** MaxOperator outputs the maximum value for the field. This is a numeric operator.
 *
 * This operator returns the original string without additional numeric formatting.
 * This can be useful when joining back to the original data. This is different than
 * numeric operators that perform calculations.
*/
final class MaxOperator : SingleFieldOperator
{
    /** Registers the operator under the name "max". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("max", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MaxCalculator(fieldIndex);
    }

    /** Tracks the largest value seen together with its original text. */
    final class MaxCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _value = double.nan;
        private string _originalString = "nan";  // Note: Cannot format floats at compile time (version 2.087)

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MaxOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            /* The field is always converted, so a non-numeric field throws from the
             * conversion even when it would not become the new maximum.
             */
            double fieldValue = to!double(nextField);

            /* The first value is always taken; after that only strictly larger ones. */
            if (_isFirst || fieldValue > _value)
            {
                _value = fieldValue;
                _originalString = to!string(nextField);
                _isFirst = false;
            }
        }

        /** Returns the original text of the maximum, or "nan" if no value was seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return _originalString;
        }
    }
}

unittest // MaxOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!MaxOperator(oneColumn, 0, "max", ["nan", "10", "10", "11"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 0, "max", ["nan", "20", "21", "22"]);
    testSingleFieldOperator!MaxOperator(twoColumns, 1, "max", ["nan", "-30", "-29", "-29"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 0, "max", ["nan", "9009", "9009", "9009"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 1, "max", ["nan", "9", "9", "9"]);
    testSingleFieldOperator!MaxOperator(threeColumns, 2, "max", ["nan", "-4.5", "-0.5", "12"]);

    auto oneColumnMissing = [[""], ["-10"], ["10"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MaxOperator(oneColumnMissing, 0, "max", ["nan", "nan", "-10", "10"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5");
    testSingleFieldOperator!MaxOperator(oneColumnMissing, 0, "max", ["nan", "5", "5", "10"],
                                        replaceMissing);
}

/** RangeOperator outputs the difference between the minimum and maximum values.
 *
 * If there is a single value, or all values are the same, the range is zero. This is
 * a numeric operator.
 */
final class RangeOperator : SingleFieldOperator
{
    /** Registers the operator under the name "range". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("range", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new RangeCalculator(fieldIndex);
    }

    /** Tracks the smallest and largest values seen. */
    final class RangeCalculator : SingleFieldCalculator
    {
        private bool _isFirst = true;
        private double _minValue = 0.0;
        private double _maxValue = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override RangeOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            double fieldValue = to!double(nextField);

            /* The first value initializes both ends of the range. */
            if (_isFirst)
            {
                _minValue = fieldValue;
                _maxValue = fieldValue;
                _isFirst = false;
                return;
            }

            if (fieldValue > _maxValue) _maxValue = fieldValue;
            else if (fieldValue < _minValue) _minValue = fieldValue;
        }

        /** Returns max - min, formatted by printOptions.formatNumber. Both ends start
         * at 0.0, so the difference is zero when no value was seen.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_maxValue - _minValue);
        }
    }
}

unittest // RangeOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!RangeOperator(oneColumn, 0, "range", ["0", "0", "0.5", "1.5"]);
    testSingleFieldOperator!RangeOperator(twoColumns, 0, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(twoColumns, 1, "range", ["0", "0", "1", "2"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 0, "range", ["0", "0", "8810", "8810"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 1, "range", ["0", "0", "9", "9"]);
    testSingleFieldOperator!RangeOperator(threeColumns, 2, "range", ["0", "0", "4", "16.5"]);

    auto oneColumnMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!RangeOperator(oneColumnMissing, 0, "range", ["0", "0", "0", "0", "0.5", "1.5"],
                                          excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "5.5");
    testSingleFieldOperator!RangeOperator(oneColumnMissing, 0, "range", ["0", "0", "4.5", "4.5", "4.5", "5.5"],
                                          replaceMissing);
}

/** SumOperator produces the sum of all the values. This is a numeric operator.
*/
final class SumOperator : SingleFieldOperator
{
    /** Registers the operator under the name "sum". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("sum", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new SumCalculator(fieldIndex);
    }

    /** Accumulates a running total of the field values. */
    final class SumCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override SumOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
        }

        /** Returns the running total, formatted by printOptions.formatNumber. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_total);
        }
    }
}

unittest // SumOperator
{
    auto oneColumn = [["10"], ["9.5"], ["11"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["199", "0", "-0.5"], ["3003", "0.2", "12"]];

    testSingleFieldOperator!SumOperator(oneColumn, 0, "sum", ["0", "10", "19.5", "30.5"]);
    testSingleFieldOperator!SumOperator(twoColumns, 0, "sum", ["0", "20", "41", "63"]);
    testSingleFieldOperator!SumOperator(twoColumns, 1, "sum", ["0", "-30", "-59", "-90"]);
    testSingleFieldOperator!SumOperator(threeColumns, 0, "sum", ["0", "9009", "9208", "12211"]);
    testSingleFieldOperator!SumOperator(threeColumns, 1, "sum", ["0", "9", "9", "9.2"]);
    testSingleFieldOperator!SumOperator(threeColumns, 2, "sum", ["0", "-4.5", "-5", "7"]);

    auto oneColumnMissing = [[""], ["10"], [""], ["9.5"], ["11"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!SumOperator(oneColumnMissing, 0, "sum", ["0", "0", "10", "10", "19.5", "30.5"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "1.5");
    testSingleFieldOperator!SumOperator(oneColumnMissing, 0, "sum", ["0", "1.5", "11.5", "13", "22.5", "33.5"],
                                        replaceMissing);
}

/** MeanOperator produces the mean (average) of all the values. This is a numeric operator.
 */
final class MeanOperator : SingleFieldOperator
{
    /** Registers the operator under the name "mean". */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mean", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MeanCalculator(fieldIndex);
    }

    /** Accumulates a running total and the number of values seen. */
    final class MeanCalculator : SingleFieldCalculator
    {
        private double _total = 0.0;
        private size_t _count = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MeanOperator getOperator()
        {
            return this.outer;
        }

        final override void processNextField(const char[] nextField)
        {
            _total += to!double(nextField);
            ++_count;
        }

        /** Returns total / count formatted by printOptions.formatNumber. When no value
         * was seen, NaN is formatted instead.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_count == 0) return printOptions.formatNumber(double.nan);

            return printOptions.formatNumber(_total / _count.to!double);
        }
    }
}

unittest // MeanOperator
{
    auto oneColumn = [["10"], ["9.5"], ["7.5"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MeanOperator(oneColumn, 0, "mean", ["nan", "10", "9.75", "9"]);
    testSingleFieldOperator!MeanOperator(twoColumns, 0, "mean", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MeanOperator(twoColumns, 1, "mean", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 0, "mean", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 1, "mean", ["nan", "9", "4.5", "2"]);
    testSingleFieldOperator!MeanOperator(threeColumns, 2, "mean", ["nan", "-4.5", "-3", "2"]);

    auto oneColumnMissing = [[""], ["6"], [""], ["14"], ["40"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MeanOperator(oneColumnMissing, 0, "mean", ["nan", "nan", "6", "6", "10", "20"],
                                         excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "0");
    testSingleFieldOperator!MeanOperator(oneColumnMissing, 0, "mean", ["nan", "0", "3", "2", "5", "12"],
                                         replaceMissing);
}

/** MedianOperator produces the median of all the values. This is a numeric operator.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
*/
final class MedianOperator : SingleFieldOperator
{
    /** Registers the operator under the name "median" and calls
     * setSaveFieldValuesNumeric().
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("median", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MedianCalculator(fieldIndex);
    }

    /** Computes the median from the values held in the unique key values lists. */
    final class MedianCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MedianOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Returns valuesLists.numericValuesMedian for this field, formatted by
         * printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(valuesLists.numericValuesMedian(fieldIndex));
        }
    }
}

unittest // MedianOperator
{
    auto oneColumn = [["10"], ["9.5"], ["7.5"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    testSingleFieldOperator!MedianOperator(oneColumn, 0, "median", ["nan", "10", "9.75", "9.5"]);
    testSingleFieldOperator!MedianOperator(twoColumns, 0, "median", ["nan", "20", "20.5", "21"]);
    testSingleFieldOperator!MedianOperator(twoColumns, 1, "median", ["nan", "-30", "-29.5", "-30"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 0, "median", ["nan", "9009", "4509", "4509"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 1, "median", ["nan", "9", "4.5", "0"]);
    testSingleFieldOperator!MedianOperator(threeColumns, 2, "median", ["nan", "-4.5", "-3", "-1.5"]);

    auto oneColumnMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MedianOperator(oneColumnMissing, 0, "median", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                           excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "0");
    testSingleFieldOperator!MedianOperator(oneColumnMissing, 0, "median", ["nan", "0", "5", "0", "4.75", "7.5"],
                                           replaceMissing);
}

/** QuantileOperator produces the value representing the data at a cumulative probability.
 * This is a numeric operation.
 *
 * As an example, quantiles might be produced for the 0.25, 0.5, and 0.75 probabilities
 * (alternately, the 25th, 50th, and 75th percentile ranks, the 50th percentile being the
 * median). Data is sorted in ascending order. This operator takes one percentile, but it
 * is common to generate multiple quantile ranks for the same field when summarizing.
 *
 * All the field's values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class QuantileOperator : SingleFieldOperator
{
    private double _prob;

    /** Creates the operator for one probability, which must be in [0.0, 1.0] (asserted).
     *
     * The operator name is "pct" followed by the probability expressed as a percentage
     * ("pct0" for 0.0, otherwise formatted with "%02g"). The constructor also calls
     * setSaveFieldValuesNumeric().
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy, double probability)
    {
        assert(probability >= 0.0 && probability <= 1.0);
        import std.format : format;

        string header;
        if (probability == 0.0) header = "pct0";
        else header = format("pct%02g", probability * 100.0);

        super(header, fieldIndex, missingPolicy);
        _prob = probability;
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new QuantileCalculator(fieldIndex);
    }

    /** Computes the quantile from the values held in the unique key values lists. */
    final class QuantileCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override QuantileOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Applies quantile() to the operator's probability and the sorted numeric
         * values for this field; the result is formatted by printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import tsv_utils.common.numerics : quantile;

            QuantileOperator owner = this.outer;
            return printOptions.formatNumber(
                quantile(owner._prob, valuesLists.numericValuesSorted(fieldIndex)));
        }
    }
}

unittest // QuantileOperator
{
    auto oneColumn = [["10"], ["9.5"], ["7.5"]];
    auto twoColumns = [["20", "-30"], ["21", "-29"], ["22", "-31"]];
    auto threeColumns = [["9009", "9", "-4.5"], ["9", "0", "-1.5"], ["4509", "-3", "12"]];

    auto defaultMissing = new MissingFieldPolicy;

    /* Same as the median tests. */
    testSingleFieldOperatorBase!QuantileOperator(oneColumn, 0, "pct50", ["nan", "10", "9.75", "9.5"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 0, "pct50", ["nan", "20", "20.5", "21"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 1, "pct50", ["nan", "-30", "-29.5", "-30"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 0, "pct50", ["nan", "9009", "4509", "4509"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 1, "pct50", ["nan", "9", "4.5", "0"], defaultMissing, 0.50);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 2, "pct50", ["nan", "-4.5", "-3", "-1.5"], defaultMissing, 0.50);

    /* The extremes (0, 1), are min and max. */
    testSingleFieldOperatorBase!QuantileOperator(oneColumn, 0, "pct0", ["nan", "10", "9.5", "7.5"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 0, "pct0", ["nan", "20", "20", "20"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 1, "pct0", ["nan", "-30", "-30", "-31"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 0, "pct0", ["nan", "9009", "9", "9"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 1, "pct0", ["nan", "9", "0", "-3"], defaultMissing, 0.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 2, "pct0", ["nan", "-4.5", "-4.5", "-4.5"], defaultMissing, 0.0);

    testSingleFieldOperatorBase!QuantileOperator(oneColumn, 0, "pct100", ["nan", "10", "10", "10"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 0, "pct100", ["nan", "20", "21", "22"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(twoColumns, 1, "pct100", ["nan", "-30", "-29", "-29"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 0, "pct100", ["nan", "9009", "9009", "9009"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 1, "pct100", ["nan", "9", "9", "9"], defaultMissing, 1.0);
    testSingleFieldOperatorBase!QuantileOperator(threeColumns, 2, "pct100", ["nan", "-4.5", "-1.5", "12"], defaultMissing, 1.0);

    /* For missing policies, re-use the median tests. */
    auto oneColumnMissing = [[""], ["10"], [""], ["9.5"], ["7.5"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperatorBase!QuantileOperator(oneColumnMissing, 0, "pct50", ["nan", "nan", "10", "10", "9.75", "9.5"],
                                                 excludeMissing, 0.5);

    auto replaceMissing = new MissingFieldPolicy(false, "0");
    testSingleFieldOperatorBase!QuantileOperator(oneColumnMissing, 0, "pct50", ["nan", "0", "5", "0", "4.75", "7.5"],
                                                 replaceMissing, 0.5);
}

/** MadOperator produces the median absolute deviation from the median. This is a numeric
 * operation.
 *
 * The result is the raw MAD value, without a normalization applied.
 *
 * All the field values are stored in memory as part of this calculation. This is
 * handled by unique key value lists.
 */
final class MadOperator : SingleFieldOperator
{
    /** Registers the operator under the name "mad" and calls
     * setSaveFieldValuesNumeric().
     */
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mad", fieldIndex, missingPolicy);
        setSaveFieldValuesNumeric();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MadCalculator(fieldIndex);
    }

    /** Computes the MAD from the values held in the unique key values lists. */
    final class MadCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MadOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the work is done by saving the field values. */
        final override void processNextField(const char[] nextField)
        {
        }

        /** Builds the absolute deviation of every saved value from the median, then
         * returns the median of those deviations formatted by printOptions.formatNumber.
         */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : abs;
            import tsv_utils.common.numerics : rangeMedian;

            auto center = valuesLists.numericValuesMedian(fieldIndex);
            auto savedValues = valuesLists.numericValues(fieldIndex);

            /* One absolute deviation per saved value. */
            auto deviations = new double[savedValues.length];
            foreach (size_t pos, double value; savedValues)
            {
                deviations[pos] = abs(value - center);
            }

            return printOptions.formatNumber(deviations.rangeMedian);
        }
    }
}

unittest // MadOperator
{
    auto oneColumn = [["10"], ["15"], ["20"], ["25"], ["30"]];
    auto twoColumns = [["2", "50"], ["2", "51"], ["2", "52"]];
    auto threeColumns = [["16", "8", "-4"], ["8", "8", "-2"], ["8", "16", "0"]];

    testSingleFieldOperator!MadOperator(oneColumn, 0, "mad", ["nan", "0", "2.5", "5", "5", "5"]);
    testSingleFieldOperator!MadOperator(twoColumns, 0, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(twoColumns, 1, "mad", ["nan", "0", "0.5", "1"]);
    testSingleFieldOperator!MadOperator(threeColumns, 0, "mad", ["nan", "0", "4", "0"]);
    testSingleFieldOperator!MadOperator(threeColumns, 1, "mad", ["nan", "0", "0", "0"]);
    testSingleFieldOperator!MadOperator(threeColumns, 2, "mad", ["nan", "0", "1", "2"]);

    auto oneColumnMissing = [[""], ["16"], [""], ["32"], ["-4"]];

    auto excludeMissing = new MissingFieldPolicy(true, "");
    testSingleFieldOperator!MadOperator(oneColumnMissing, 0, "mad", ["nan", "nan", "0", "0", "8", "16"],
                                        excludeMissing);

    auto replaceMissing = new MissingFieldPolicy(false, "0");
    testSingleFieldOperator!MadOperator(oneColumnMissing, 0, "mad", ["nan", "0", "8", "0", "8", "4"],
                                        replaceMissing);
}

/** Generates the variance of the fields values. This is a numeric operator.
*/
final class VarianceOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("var", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new VarianceCalculator(fieldIndex);
    }

    /* Keeps a running count, mean, and sum of squared differences, updated one
     * value at a time (Welford's online algorithm). No field values are stored.
     */
    final class VarianceCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;      // Running sum of squared differences from the mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override VarianceOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running statistics. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;
            double sample = to!double(nextField);
            double offsetFromOldMean = sample - _mean;
            _mean += offsetFromOldMean / _count;
            /* The second factor uses the mean after it has been updated. */
            _m2 += offsetFromOldMean * (sample - _mean);
        }

        /* Sample variance (divides by count - 1); nan with fewer than two values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            if (_count >= 2.0)
            {
                return printOptions.formatNumber(_m2 / (_count - 1.0));
            }
            return printOptions.formatNumber(double.nan);
        }
    }
}

unittest // VarianceOperator
{
    auto oneCol = [["5"], ["10"], ["15"]];
    auto twoCol = [["-5", "-5"], ["-10", "0"], ["-15", "5"]];
    auto threeCol = [["1", "2", "100"], ["2", "3", "100"], ["3", "4", "103"]];

    testSingleFieldOperator!VarianceOperator(
        oneCol, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(
        twoCol, 0, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(
        twoCol, 1, "var", ["nan", "nan", "12.5", "25"]);
    testSingleFieldOperator!VarianceOperator(
        threeCol, 0, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(
        threeCol, 1, "var", ["nan", "nan", "0.5", "1"]);
    testSingleFieldOperator!VarianceOperator(
        threeCol, 2, "var", ["nan", "nan", "0", "3"]);

    auto oneColGaps = [["5"], ["10"], [""]];
    testSingleFieldOperator!VarianceOperator(
        oneColGaps, 0, "var", ["nan", "nan", "12.5", "12.5"],
        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!VarianceOperator(
        oneColGaps, 0, "var", ["nan", "nan", "12.5", "25"],
        new MissingFieldPolicy(false, "15"));  // Replace missing
}

/** Generates the standard deviation of the field values. This is a numeric operator.
*/
final class StDevOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("stdev", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new StDevCalculator(fieldIndex);
    }

    /* Keeps a running count, mean, and sum of squared differences, updated one
     * value at a time (Welford's online algorithm). No field values are stored.
     */
    final class StDevCalculator : SingleFieldCalculator
    {
        private double _count = 0.0;
        private double _mean = 0.0;
        private double _m2 = 0.0;      // Running sum of squared differences from the mean

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override StDevOperator getOperator()
        {
            return this.outer;
        }

        /* Folds one more value into the running statistics. */
        final override void processNextField(const char[] nextField)
        {
            _count += 1.0;
            double sample = to!double(nextField);
            double offsetFromOldMean = sample - _mean;
            _mean += offsetFromOldMean / _count;
            /* The second factor uses the mean after it has been updated. */
            _m2 += offsetFromOldMean * (sample - _mean);
        }

        /* Square root of the sample variance; nan with fewer than two values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            import std.math : sqrt;

            if (_count >= 2.0)
            {
                return printOptions.formatNumber(sqrt(_m2 / (_count - 1.0)));
            }
            return printOptions.formatNumber(double.nan);
        }
    }
}

/* StDevOperator unit tests. The expected values are fixed strings; a tolerance
 * option would improve these tests.
 */
unittest
{
    auto oneCol = [["1"], ["4"], ["7"]];
    auto twoCol = [["3", "3"], ["3", "9"], ["7", "15"]];
    auto threeCol = [["11", "10", "10"], ["24", "22", "25"], ["37", "34", "40"]];

    testSingleFieldOperator!StDevOperator(
        oneCol, 0, "stdev", ["nan", "nan", "2.12132034356", "3"]);
    testSingleFieldOperator!StDevOperator(
        twoCol, 0, "stdev", ["nan", "nan", "0", "2.30940107676"]);
    testSingleFieldOperator!StDevOperator(
        twoCol, 1, "stdev", ["nan", "nan", "4.24264068712", "6"]);
    testSingleFieldOperator!StDevOperator(
        threeCol, 0, "stdev", ["nan", "nan", "9.19238815543", "13"]);
    testSingleFieldOperator!StDevOperator(
        threeCol, 1, "stdev", ["nan", "nan", "8.48528137424", "12"]);
    testSingleFieldOperator!StDevOperator(
        threeCol, 2, "stdev", ["nan", "nan", "10.6066017178", "15"]);

    auto oneColGaps = [["1"], ["4"], [""]];
    testSingleFieldOperator!StDevOperator(
        oneColGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "2.12132034356"],
        new MissingFieldPolicy(true, ""));  // Exclude missing
    testSingleFieldOperator!StDevOperator(
        oneColGaps, 0, "stdev", ["nan", "nan", "2.12132034356", "3"],
        new MissingFieldPolicy(false, "7"));  // Replace missing
}

/** UniqueCountOperator generates the number of unique values. Unique values are
 * based on exact text match calculation, not a numeric comparison.
 *
 * All the unique field values are stored in memory as part of this calculation.
*/
final class UniqueCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueCountCalculator(fieldIndex);
    }

    final class UniqueCountCalculator : SingleFieldCalculator
    {
        /* Used as a set: the keys are the distinct field texts seen so far. */
        private bool[string] _values;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueCountOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field text if it has not been seen before. */
        final override void processNextField(const char[] nextField)
        {
            if ((nextField in _values) is null)
            {
                /* Convert to an immutable string only when a new key is added. */
                _values[to!string(nextField)] = true;
            }
        }

        /* Returns the formatted number of distinct values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_values.length);
        }
    }
}

unittest // UniqueCount
{
    auto oneCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    auto twoCol = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!UniqueCountOperator(
        oneCol, 0, "unique_count", ["0", "1", "2", "3", "3", "3", "3", "3", "4"]);
    testSingleFieldOperator!UniqueCountOperator(
        twoCol, 0, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        twoCol, 1, "unique_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        threeCol, 0, "unique_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!UniqueCountOperator(
        threeCol, 1, "unique_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!UniqueCountOperator(
        threeCol, 2, "unique_count", ["0", "1", "2", "3"]);

    auto oneColGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"], ["ab"]];
    testSingleFieldOperator!UniqueCountOperator(
        oneColGaps, 0, "unique_count", ["0", "0", "1", "1", "2", "3", "3", "3", "3", "3", "4"],
        new MissingFieldPolicy(true, ""));  // Exclude missing

    testSingleFieldOperator!UniqueCountOperator(
        oneColGaps, 0, "unique_count", ["0", "1", "2", "2", "3", "4", "4", "4", "4", "4", "5"],
        new MissingFieldPolicy(false, "XYZ"));  // Replace missing
}

/** MissingCountOperator generates the number of missing values. This overrides
 * the global missingFieldsPolicy: the policy handed to the constructor is kept
 * only to decide which fields count as missing, while the base class is given a
 * separate MissingFieldPolicy(false, "").
 */
final class MissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new MissingCountCalculator(fieldIndex);
    }

    final class MissingCountCalculator : SingleFieldCalculator
    {
        private size_t _missingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override MissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field when the saved global policy reports it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;
            if (globalPolicy.isMissingField(nextField))
            {
                _missingCount++;
            }
        }

        /* Returns the formatted number of missing values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_missingCount);
        }
    }
}

unittest // MissingCount
{
    auto oneCol = [["a"], ["b"], [""], [" "], [""]];
    auto twoCol = [["abc", ""], ["", ""], ["def", ""]];
    auto threeCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!MissingCountOperator(
        oneCol, 0, "missing_count", ["0", "0", "0", "1", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(
        twoCol, 0, "missing_count", ["0", "0", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(
        twoCol, 1, "missing_count", ["0", "1", "2", "3"]);
    testSingleFieldOperator!MissingCountOperator(
        threeCol, 0, "missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!MissingCountOperator(
        threeCol, 1, "missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!MissingCountOperator(
        threeCol, 2, "missing_count", ["0", "0", "0", "1"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under either policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!MissingCountOperator(
            oneCol, 0, "missing_count", ["0", "0", "0", "1", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            twoCol, 0, "missing_count", ["0", "0", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            twoCol, 1, "missing_count", ["0", "1", "2", "3"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeCol, 0, "missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeCol, 1, "missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!MissingCountOperator(
            threeCol, 2, "missing_count", ["0", "0", "0", "1"], policy);
    }
}

/** NotMissingCountOperator generates the number of not-missing values. This
 * overrides the global missingFieldsPolicy: the policy handed to the constructor
 * is kept only to decide which fields count as missing, while the base class is
 * given a separate MissingFieldPolicy(false, "").
 */
final class NotMissingCountOperator : SingleFieldOperator
{
    private MissingFieldPolicy _globalMissingPolicy;

    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        _globalMissingPolicy = missingPolicy;
        super("not_missing_count", fieldIndex, new MissingFieldPolicy(false, ""));
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new NotMissingCountCalculator(fieldIndex);
    }

    final class NotMissingCountCalculator : SingleFieldCalculator
    {
        private size_t _notMissingCount = 0;

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override NotMissingCountOperator getOperator()
        {
            return this.outer;
        }

        /* Counts the field unless the saved global policy reports it as missing. */
        final override void processNextField(const char[] nextField)
        {
            auto globalPolicy = this.outer._globalMissingPolicy;
            if (!globalPolicy.isMissingField(nextField))
            {
                _notMissingCount++;
            }
        }

        /* Returns the formatted number of not-missing values. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            return printOptions.formatNumber(_notMissingCount);
        }
    }
}

unittest // NotMissingCount
{
    auto oneCol = [["a"], ["b"], [""], [" "], [""]];
    auto twoCol = [["abc", ""], ["", ""], ["def", ""]];
    auto threeCol = [["", "1", "a"], ["2.0", "", "1"], ["2", "", ""]];

    testSingleFieldOperator!NotMissingCountOperator(
        oneCol, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!NotMissingCountOperator(
        twoCol, 0, "not_missing_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(
        twoCol, 1, "not_missing_count", ["0", "0", "0", "0"]);
    testSingleFieldOperator!NotMissingCountOperator(
        threeCol, 0, "not_missing_count", ["0", "0", "1", "2"]);
    testSingleFieldOperator!NotMissingCountOperator(
        threeCol, 1, "not_missing_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!NotMissingCountOperator(
        threeCol, 2, "not_missing_count", ["0", "1", "2", "2"]);

    auto excludeMissing = new MissingFieldPolicy(true, "");
    auto replaceMissing = new MissingFieldPolicy(false, "X");

    /* The expected counts are the same under either policy. */
    foreach (policy; [excludeMissing, replaceMissing])
    {
        testSingleFieldOperator!NotMissingCountOperator(
            oneCol, 0, "not_missing_count", ["0", "1", "2", "2", "3", "3"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            twoCol, 0, "not_missing_count", ["0", "1", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            twoCol, 1, "not_missing_count", ["0", "0", "0", "0"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            threeCol, 0, "not_missing_count", ["0", "0", "1", "2"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            threeCol, 1, "not_missing_count", ["0", "1", "1", "1"], policy);
        testSingleFieldOperator!NotMissingCountOperator(
            threeCol, 2, "not_missing_count", ["0", "1", "2", "2"], policy);
    }
}

/** ModeOperator outputs the most frequent value seen. In the event of a tie, the
 * first value seen is produced.
 *
 * Each distinct value and its occurrence count are kept in memory as part of this
 * calculation.
 */
final class ModeOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCalculator(fieldIndex);
    }

    final class ModeCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;           // Occurrences per distinct value
        private Appender!(string[]) _uniqueValues;     // Distinct values, in first-seen order

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto countPtr = nextField in _valueCounts)
            {
                (*countPtr)++;
            }
            else
            {
                string newValue = to!string(nextField);
                _uniqueValues.put(newValue);
                _valueCounts[newValue] = 1;
            }
        }

        /* Returns the most frequent value, or an empty string if no values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            string winner = "";
            size_t winnerCount = 0;

            /* Candidates are visited in first-seen order. A later value must be
             * strictly more frequent to replace the current winner, so ties go to
             * the earliest value.
             */
            foreach (candidate; _uniqueValues.data)
            {
                assert(candidate in _valueCounts);

                auto candidateCount = _valueCounts[candidate];
                if (candidateCount <= winnerCount) continue;

                winner = candidate;
                winnerCount = candidateCount;
            }

            return winner;
        }
    }
}

unittest // ModeOperator
{
    auto oneCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoCol = [["abc", "pqr"], ["def", "pqr"], ["def", "xyz"]];
    auto threeCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeOperator(
        oneCol, 0, "mode", ["", "a", "a", "a", "c", "b", "b", "b"]);
    testSingleFieldOperator!ModeOperator(
        twoCol, 0, "mode", ["", "abc", "abc", "def"]);
    testSingleFieldOperator!ModeOperator(
        twoCol, 1, "mode", ["", "pqr", "pqr", "pqr"]);
    testSingleFieldOperator!ModeOperator(
        threeCol, 0, "mode", ["", "1.0", "1.0", "1.0"]);
    testSingleFieldOperator!ModeOperator(
        threeCol, 1, "mode", ["", "1", "1", "a"]);
    testSingleFieldOperator!ModeOperator(
        threeCol, 2, "mode", ["", "a", "a", "a"]);

    auto oneColGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeOperator(
        oneColGaps, 0, "mode", ["", "", "a", "a", "a", "a", "c", "b", "b"],
        new MissingFieldPolicy(true, ""));  // Exclude missing

    testSingleFieldOperator!ModeOperator(
        oneColGaps, 0, "mode", ["", "X", "X", "X", "X", "X", "X", "X", "b"],
        new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** ModeCountOperator outputs the count of the most frequent value seen.
 *
 * All the field values are stored in memory as part of this calculation.
*/
final class ModeCountOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("mode_count", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ModeCountCalculator(fieldIndex);
    }

    final class ModeCountCalculator : SingleFieldCalculator
    {
        private size_t[string] _valueCounts;    // Occurrences per distinct value

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ModeCountOperator getOperator()
        {
            return this.outer;
        }

        /* Increments the count of a known value, or registers a new value with count 1. */
        final override void processNextField(const char[] nextField)
        {
            if (auto countPtr = nextField in _valueCounts)
            {
                (*countPtr)++;
            }
            else
            {
                string newValue = to!string(nextField);
                _valueCounts[newValue] = 1;
            }
        }

        /* Returns the formatted largest occurrence count; zero if no values were seen. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            size_t largest = 0;
            foreach (occurrences; _valueCounts.byValue)
            {
                if (largest < occurrences) largest = occurrences;
            }
            return printOptions.formatNumber(largest);
        }
    }
}

unittest // ModeCountOperator
{
    auto oneCol = [["a"], ["b"], ["c"], ["c"], ["b"], ["b"], ["a"]];
    auto twoCol = [["abc", ""], ["def", ""], ["def", "xyz"]];
    auto threeCol = [["1.0", "1", "a"], ["2.0", "a", "1"], ["2", "a", "1.0"]];

    testSingleFieldOperator!ModeCountOperator(
        oneCol, 0, "mode_count", ["0", "1", "1", "1", "2", "2", "3", "3"]);
    testSingleFieldOperator!ModeCountOperator(
        twoCol, 0, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        twoCol, 1, "mode_count", ["0", "1", "2", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        threeCol, 0, "mode_count", ["0", "1", "1", "1"]);
    testSingleFieldOperator!ModeCountOperator(
        threeCol, 1, "mode_count", ["0", "1", "1", "2"]);
    testSingleFieldOperator!ModeCountOperator(
        threeCol, 2, "mode_count", ["0", "1", "1", "1"]);

    auto oneColGaps = [[""], ["a"], [""], ["b"], ["c"], ["c"], ["b"], ["b"]];
    testSingleFieldOperator!ModeCountOperator(
        oneColGaps, 0, "mode_count", ["0", "0", "1", "1", "1", "1", "2", "2", "3"],
        new MissingFieldPolicy(true, ""));  // Exclude missing

    testSingleFieldOperator!ModeCountOperator(
        oneColGaps, 0, "mode_count", ["0", "1", "1", "2", "2", "2", "2", "2", "3"],
        new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** ValuesOperator outputs every value of a field, joined with the values delimiter
 * taken from the print options.
 *
 * All the field values are kept in memory for this calculation. Storage is handled
 * by the unique key value lists.
 */
final class ValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("values", fieldIndex, missingPolicy);
        setSaveFieldValuesText();
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new ValuesCalculator(fieldIndex);
    }

    final class ValuesCalculator : SingleFieldCalculator
    {
        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override ValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Nothing to do per field: the saved field values carry all the state. */
        final override void processNextField(const char[] nextField)
        { }

        /* Joins the saved text values with the configured values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto savedValues = valuesLists.textValues(fieldIndex);
            return savedValues.join(printOptions.valuesDelimiter);
        }
    }
}

unittest // ValuesOperator
{
    auto oneCol = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoCol = [["", "50"], ["", "51"], ["xyz", "52"]];
    auto threeCol = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "---"]];

    testSingleFieldOperator!ValuesOperator(
        oneCol, 0, "values",
        ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e|", "a||b|cd|e||a"]);
    testSingleFieldOperator!ValuesOperator(
        twoCol, 0, "values", ["", "", "|", "||xyz"]);
    testSingleFieldOperator!ValuesOperator(
        twoCol, 1, "values", ["", "50", "50|51", "50|51|52"]);
    testSingleFieldOperator!ValuesOperator(
        threeCol, 0, "values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!ValuesOperator(
        threeCol, 1, "values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!ValuesOperator(
        threeCol, 2, "values", ["", "-", "-|--", "-|--|---"]);

    testSingleFieldOperator!ValuesOperator(
        oneCol, 0, "values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e|a"],
        new MissingFieldPolicy(true, ""));  // Exclude missing

    testSingleFieldOperator!ValuesOperator(
        oneCol, 0, "values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e|X", "a|X|b|cd|e|X|a"],
        new MissingFieldPolicy(false, "X"));  // Replace missing
}

/** UniqueValuesOperator outputs each unique value delimited by an alternate delimiter
 * character. Values are output in the order seen.
 *
 * All unique field values are stored in memory as part of this calculation.
*/
final class UniqueValuesOperator : SingleFieldOperator
{
    this(size_t fieldIndex, MissingFieldPolicy missingPolicy)
    {
        super("unique_values", fieldIndex, missingPolicy);
    }

    final override SingleFieldCalculator makeCalculator()
    {
        return new UniqueValuesCalculator(fieldIndex);
    }

    final class UniqueValuesCalculator : SingleFieldCalculator
    {
        private size_t[string] _valuesHash;            // Membership lookup; the stored number is not read
        private Appender!(string[]) _uniqueValues;     // Distinct values, in first-seen order

        this(size_t fieldIndex)
        {
            super(fieldIndex);
        }

        final override UniqueValuesOperator getOperator()
        {
            return this.outer;
        }

        /* Records the field text the first time it is seen; repeats are ignored. */
        final override void processNextField(const char[] nextField)
        {
            if (nextField in _valuesHash) return;

            string newValue = to!string(nextField);
            _uniqueValues.put(newValue);
            _valuesHash[newValue] = 1;
        }

        /* Joins the distinct values, in first-seen order, with the values delimiter. */
        final string calculate(UniqueKeyValuesLists valuesLists, const ref SummarizerPrintOptions printOptions)
        {
            auto distinctValues = _uniqueValues.data;
            return distinctValues.join(printOptions.valuesDelimiter);
        }
    }
}

unittest // UniqueValuesOperator
{
    auto oneCol = [["a"], [""], ["b"], ["cd"], ["e"], [""], ["a"]];
    auto twoCol = [["", "50"], ["", "50"], ["xyz", "52"]];
    auto threeCol = [["z", "a", "-"], ["y", "ab", "--"], ["w", "ba", "-"]];

    testSingleFieldOperator!UniqueValuesOperator(
        oneCol, 0, "unique_values",
        ["", "a", "a|", "a||b", "a||b|cd", "a||b|cd|e", "a||b|cd|e", "a||b|cd|e"]);
    testSingleFieldOperator!UniqueValuesOperator(
        twoCol, 0, "unique_values", ["", "", "", "|xyz"]);
    testSingleFieldOperator!UniqueValuesOperator(
        twoCol, 1, "unique_values", ["", "50", "50", "50|52"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeCol, 0, "unique_values", ["", "z", "z|y", "z|y|w"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeCol, 1, "unique_values", ["", "a", "a|ab", "a|ab|ba"]);
    testSingleFieldOperator!UniqueValuesOperator(
        threeCol, 2, "unique_values", ["", "-", "-|--", "-|--"]);

    testSingleFieldOperator!UniqueValuesOperator(
        oneCol, 0, "unique_values",
        ["", "a", "a", "a|b", "a|b|cd", "a|b|cd|e", "a|b|cd|e", "a|b|cd|e"],
        new MissingFieldPolicy(true, ""));  // Exclude missing

    testSingleFieldOperator!UniqueValuesOperator(
        oneCol, 0, "unique_values",
        ["", "a", "a|X", "a|X|b", "a|X|b|cd", "a|X|b|cd|e", "a|X|b|cd|e", "a|X|b|cd|e"],
        new MissingFieldPolicy(false, "X"));  // Replace missing
}