/**
Command line tool implementing weighted reservoir sampling on delimited data files.
Weights are read from a field in the file.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /* Program entry point.
     *
     * Parses the command line into a TsvSampleOptions, then dispatches to exactly one
     * sampling routine:
     *   - streamSampling      when cmdopt.useStreamSampling is set,
     *   - distinctSampling    when cmdopt.useDistinctSampling is set,
     *   - reservoirSampling   otherwise (permuteAll when no sample size was given).
     *
     * Returns: The exit status supplied by processArgs when processing should not
     *   continue (help, version, or an argument error); 1 if a sampling routine throws;
     *   0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];     // r[0]: continue processing? r[1]: exit status.
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            import tsvutil : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            if (cmdopt.useStreamSampling)
            {
                streamSampling(cmdopt, bufferedOutput);
            }
            else if (cmdopt.useDistinctSampling)
            {
                distinctSampling(cmdopt, bufferedOutput);
            }
            else if (cmdopt.sampleSize == 0)
            {
                reservoirSampling!(Yes.permuteAll)(cmdopt, bufferedOutput);
            }
            else
            {
                reservoirSampling!(No.permuteAll)(cmdopt, bufferedOutput);
            }
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Short help text, printed for --help ahead of the generated option list. */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Samples or randomizes input lines. There are several modes of operation:
* Randomization (Default): Input lines are output in random order.
* Stream sampling (--r|rate): Input lines are sampled based on a sampling
  rate. The order of the input is unchanged.
* Distinct sampling (--k|key-fields, --r|rate): Sampling is based on the
  values in the key field. A portion of the keys are chosen based on the
  sampling rate (a distinct set). All lines with one of the selected keys
  are output. Input order is unchanged.
* Weighted sampling (--w|weight-field): Input lines are selected using
  weighted random sampling, with the weight taken from a field. Input
  lines are output in the order selected, reordering the lines.

The '--n|num' option limits the sample size produced. It speeds up the
randomization and weighted sampling cases significantly.

Use '--help-verbose' for detailed information.

Options:
EOS";

/* Detailed help text, printed for --help-verbose ahead of the generated option list. */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Samples or randomizes input lines. There are several modes of operation:
* Randomization (Default): Input lines are output in random order.
* Stream sampling (--r|rate): Input lines are sampled based on a sampling
  rate. The order of the input is unchanged.
* Distinct sampling (--k|key-fields, --r|rate): Sampling is based on the
  values in the key field. A portion of the keys are chosen based on the
  sampling rate (a distinct set). All lines with one of the selected keys
  are output. Input order is unchanged.
* Weighted sampling (--w|weight-field): Input lines are selected using
  weighted random sampling, with the weight taken from a field. Input
  lines are output in the order selected, reordering the lines. See
  'Weighted sampling' below for info on field weights.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up randomization and weighted sampling significantly (details below).

Controlling randomization: Each run produces a different randomization.
Using '--s|static-seed' changes this so multiple runs produce the same
randomization. This works by using the same random seed each run. The
random seed can be specified using '--v|seed-value'. This takes a
non-zero, 32-bit positive integer. (A zero value is a no-op and ignored.)

Generating random weights: The random weight assigned to each line can
be output using the '--p|print-random' option. This can be used with
'--rate 1' to assign a random weight to each line. The random weight
is prepended to the line as field one (separated by TAB or --d|delimiter
char). Weights are in the interval [0,1]. The open/closed aspects of the
interval (including/excluding 0.0 and 1.0) are subject to change and
should not be relied on.

Reservoir sampling: The randomization and weighted sampling cases are
implemented using reservoir sampling. This means all lines output must be
held in memory. Memory needed for large input streams can be reduced
significantly using a sample size. Both 'tsv-sample -n 1000' and
'tsv-sample | head -n 1000' produce the same results, but the former is
quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Efraimidis and Spirakis. Weights should be positive values
representing the relative weight of the entry in the collection. Counts
and similar can be used as weights, it is *not* necessary to normalize to
a [0,1] interval. Negative values are not meaningful and given the value
zero. Input order is not retained, instead lines are output ordered by
the randomized weight that was assigned. This means that a smaller valid
sample can be produced by taking the first N lines of output. For more
info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Options:
EOS";

/* Command line options for tsv-sample.
 *
 * Fields marked "Derived" are not set directly by getopt; processArgs computes them
 * from the parsed options.
 */
struct TsvSampleOptions
{
    string programName;
    string[] files;
    bool helpVerbose = false;         // --help-verbose
    double sampleRate = double.nan;   // --r|rate - Sampling rate
    size_t sampleSize = 0;            // --n|num - Size of the desired sample
    size_t weightField = 0;           // --w|weight-field - Field holding the weight
    size_t[] keyFields;               // --k|key-fields - Used with sampling rate
    bool hasHeader = false;           // --H|header
    bool printRandom = false;         // --p|print-random
    bool staticSeed = false;          // --s|static-seed
    uint seedValueOptionArg = 0;      // --v|seed-value
    char delim = '\t';                // --d|delimiter
    bool versionWanted = false;       // --V|version
    bool hasWeightField = false;      // Derived.
    bool useStreamSampling = false;   // Derived.
    bool useDistinctSampling = false; // Derived.
    uint seed = 0;                    // Derived from --static-seed, --seed-value

    /* Parses the command line and fills in this struct.
     *
     * On success cmdArgs is truncated to the program name and the remaining arguments
     * are appended to 'files' ("-", meaning standard input, if none were given).
     *
     * Returns: tuple(continueProcessing, exitStatus).
     *   - tuple(true, 0):  options are valid, the caller should run the sampler.
     *   - tuple(false, 0): help or version text was printed.
     *   - tuple(false, 1): an argument error was reported on stderr.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print more detailed help.", &helpVerbose,
                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,
                "r|rate", "NUM Sampling rate (0.0 < NUM <= 1.0). This sampling mode outputs a random fraction of lines, in the input order.", &sampleRate,
                "n|num", "NUM Number of lines to output. All lines are output if not provided or zero.", &sampleSize,
                "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField,

                "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with --r|rate.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "p|print-random", " Output the random values that were assigned.", &printRandom,
                "s|static-seed", " Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value", "NUM Sets the initial random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "d|delimiter", "CHR Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (keyFields.length > 0 && sampleRate.isNaN)
            {
                throw new Exception("--r|rate is required when using --k|key-fields.");
            }

            /* Sample rate (--r|rate) is used for both stream sampling and distinct sampling. */
            if (!sampleRate.isNaN)
            {
                if (sampleRate <= 0.0 || sampleRate > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --r|rate option: %g. Must satisfy 0.0 < rate <= 1.0.", sampleRate));
                }

                if (hasWeightField) throw new Exception("--w|weight-field and --r|rate cannot be used together.");

                /* Key fields select distinct sampling; a rate alone selects stream sampling. */
                if (keyFields.length > 0) useDistinctSampling = true;
                else useStreamSampling = true;
            }

            /* Seed. An explicit non-zero --seed-value wins, then the fixed value used
             * for --static-seed, otherwise an unpredictable seed.
             */
            import std.random : unpredictableSeed;
            seed = (seedValueOptionArg != 0) ? seedValueOptionArg
                : staticSeed ? 2438424139
                : unpredictableSeed;

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}

/* streamSampling does simple Bernoulli sampling on the input stream. Each input line
 * is assigned a random value and output if less than the sampling rate.
 *
 * Params:
 *   cmdopt       = Parsed options. Uses files, seed, sampleRate, sampleSize, hasHeader,
 *                  printRandom and delim.
 *   outputStream = Output range of char receiving the header (once, from the first file
 *                  when --header is set) and every selected line, newline terminated.
 *
 * When a sample size was given (sampleSize != 0) the function returns as soon as that
 * many lines have been written. With --print-random the random value is written ahead
 * of each line, and "random_weight" ahead of the header.
 *
 * Note: Performance tests show that skip sampling is faster when the sampling rate
 * is approximately 4-5% or less. An optimization would be to have separate function
 * to use when the sampling rate is small and the random weights are not being added
 * to each line.
 */
void streamSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is written; later headers are dropped. */
                if (!headerWritten)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put("random_weight");
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                double lineScore = uniform01(randomGenerator);
                if (lineScore < cmdopt.sampleRate)
                {
                    if (cmdopt.printRandom)
                    {
                        import std.format;
                        outputStream.put(format("%.15g", lineScore));
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");

                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
            }
        }
    }
}

/* distinctSampling samples a portion of the unique values from the key fields. This
 * is done by hashing the key and mapping the hash value into buckets matching the
 * sampling rate size. Records having a key mapping to bucket zero are output.
 *
 * Params:
 *   cmdopt       = Parsed options. Uses files, keyFields (zero-based), sampleRate, seed,
 *                  sampleSize, hasHeader and delim. keyFields must be non-empty and
 *                  sampleRate must satisfy 0.0 < rate <= 1.0 (both asserted).
 *   outputStream = Output range of char receiving the header (once) and every line
 *                  whose key was selected, newline terminated, in input order.
 *
 * The number of buckets is 1.0 / sampleRate rounded to an integer. The hash is seeded
 * with cmdopt.seed. When a sample size was given (sampleSize != 0) the function returns
 * as soon as that many lines have been written.
 *
 * Throws: Exception if a data line has fewer fields than the key fields require.
 */
void distinctSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.sampleRate && cmdopt.sampleRate <= 1.0);

    immutable ubyte[1] delimArray = [cmdopt.delim];   // For assembling multi-field hash keys.

    uint numBuckets = (1.0 / cmdopt.sampleRate).lrint.to!uint;

    /* Create a mapping for the key fields. */
    auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.keyFields);

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is written; later headers are dropped. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Gather the key field values and assemble the key. */
                keyFieldsReordering.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyFieldsReordering.processNextField(fieldIndex, fieldValue);
                    if (keyFieldsReordering.allFieldsFilled) break;
                }

                if (!keyFieldsReordering.allFieldsFilled)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, fileLineNum));
                }

                /* Hash the key fields, joined by the delimiter, seeded with the run's seed. */
                auto hasher = MurmurHash3!32(cmdopt.seed);
                foreach (count, key; keyFieldsReordering.outputFields.enumerate)
                {
                    if (count > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) key);
                }
                hasher.finish;
                if (hasher.get % numBuckets == 0)
                {
                    outputStream.put(line);
                    outputStream.put("\n");

                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
            }
        }
    }
}

/* An implementation of reservoir sampling. Both weighted and unweighted sampling are
 * supported. Both are implemented using the one-pass algorithm described by Efraimidis
 * and Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
Efraimidis,
 * https://arxiv.org/abs/1012.0256). In the unweighted case weights are simply set to one.
 *
 * Both sampling and full permutation of the input are supported, but the implementations
 * differ. Both use a heap (priority queue). A "max" heap is used when permuting all lines,
 * as it leaves the heap in the correct order for output. However, a "min" heap is used
 * when sampling. When sampling, the role of the heap is to identify the top-k
 * elements. Adding a new item means dropping the "min" item. When done reading all lines,
 * the "min" heap is in the opposite order needed for output. The desired order is obtained
 * by removing each element one at a time from the heap. The underlying data store will
 * have the elements in correct order. The other notable difference is that the backing
 * store can be pre-allocated when sampling, but must be grown when permuting all lines.
 *
 * Template params:
 *   permuteAll   = Yes.permuteAll keeps and outputs every line (requires
 *                  cmdopt.sampleSize == 0); No.permuteAll keeps only the
 *                  cmdopt.sampleSize highest scoring lines (requires sampleSize > 0).
 *
 * Params:
 *   cmdopt       = Parsed options. Uses files, seed, sampleSize, hasHeader, printRandom,
 *                  delim, hasWeightField and weightField.
 *   outputStream = Output range of char receiving the header (once, from the first file
 *                  when --header is set) followed by the selected lines, highest score
 *                  first, each newline terminated.
 */
void reservoirSampling(Flag!"permuteAll" permuteAll, OutputRange)
    (TsvSampleOptions cmdopt, OutputRange outputStream)
    if (isOutputRange!(OutputRange, char))
{
    import std.random : Random, uniform01;
    import std.container.binaryheap;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    /* Ensure the correct version of the template was called. */
    static if (permuteAll) assert(cmdopt.sampleSize == 0);
    else assert(cmdopt.sampleSize > 0);

    auto randomGenerator = Random(cmdopt.seed);

    /* A retained input line together with the random score it was assigned. */
    struct Entry
    {
        double score;
        char[] line;
    }

    /* Create the heap and backing data store. A min or max heap is used as described
     * above. The backing store has some complications resulting from the current
     * standard library implementation:
     * - Built-in arrays appear to have better memory behavior when appending than
     *   std.container.array Arrays. However, built-in arrays cannot be used with
     *   binaryheaps until Phobos version 2.072.
     * - std.container.array Arrays with pre-allocated storage can be used to
     *   efficiently reverse the heap, but a bug prevents this from working for other
     *   data store use cases. Info: https://issues.dlang.org/show_bug.cgi?id=17094
     * - Result: Use a built-in array if request is for permuteAll and Phobos version
     *   is 2.072 or later. Otherwise use a std.container.array Array.
     */

    static if (permuteAll && __VERSION__ >= 2072)
    {
        Entry[] dataStore;
    }
    else
    {
        import std.container.array;
        Array!Entry dataStore;
    }

    /* Pre-allocate for the requested sample. In the permuteAll case sampleSize is zero
     * (asserted above), so nothing is reserved and the store grows as lines are added.
     */
    dataStore.reserve(cmdopt.sampleSize);

    /* The heap is created over the (still empty) store with an initial size of zero. */
    static if (permuteAll)
    {
        auto reservoir = dataStore.heapify!("a.score < b.score")(0);  // Max binaryheap
    }
    else
    {
        auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap
    }

    /* Process each line. */
    bool headerWritten = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header is written; later headers are dropped. */
                if (!headerWritten)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put("random_weight");
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Unweighted case: every line gets weight 1.0. */
                double lineWeight =
                    cmdopt.hasWeightField
                    ? getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum)
                    : 1.0;
                /* Score is u ^^ (1/weight) for a uniform random u. Any weight that is
                 * not greater than zero gets score 0.0.
                 */
                double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                    : 0.0;

                /* The line is copied (line.dup) before being stored, as entries outlive
                 * the current loop iteration.
                 */
                static if (permuteAll)
                {
                    reservoir.insert(Entry(lineScore, line.dup));
                }
                else
                {
                    if (reservoir.length < cmdopt.sampleSize)
                    {
                        /* Reservoir not yet full: keep the line unconditionally. */
                        reservoir.insert(Entry(lineScore, line.dup));
                    }
                    else if (reservoir.front.score < lineScore)
                    {
                        /* Full: replace the current minimum if this line scores higher. */
                        reservoir.replaceFront(Entry(lineScore, line.dup));
                    }
                }
            }
        }
    }

    /* All entries are in the reservoir. Time to print. Entries are printed ordered
     * by assigned weights. In the sampling/top-k cases this could be sped up a little
     * by simply printing the backing store array. However, there is real value in
     * having a weighted order. This is especially true for weighted sampling, but
     * there is also value in the unweighted case, especially when using static seeds.
     */

    /* Writes one entry, preceded by its score and the delimiter when --print-random is set. */
    void printEntry(Entry entry)
    {
        if (cmdopt.printRandom)
        {
            import std.format;
            outputStream.put(format("%.15g", entry.score));
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }

    static if (permuteAll)
    {
        foreach (entry; reservoir) printEntry(entry);  // Walk the max-heap
    }
    else
    {
        /* Sampling/top-n case: Reorder the data store by extracting all the elements.
         * Note: Asserts are chosen to avoid issues in the current binaryheap implementation.
         */
        size_t numLines = reservoir.length;
        assert(numLines == dataStore.length);

        /* Emptying the min-heap leaves the backing store in output order (see the
         * function header); the store itself is then printed front to back.
         */
        while (!reservoir.empty) reservoir.removeFront;
        assert(numLines == dataStore.length);
        foreach (entry; dataStore) printEntry(entry);
    }
}

/* A convenience function for extracting a single field from a line. See getTsvFieldValue in
 * common/src/tsvutils.d for details. This wrapper creates error text tailored for this program.
*/
import std.traits : isSomeChar;

/* getFieldValue parameters and errors:
 *   line       - The input line to read the field from.
 *   fieldIndex - Zero-based index of the field to extract.
 *   delim      - Field delimiter.
 *   filename   - Name of the input source, "-" meaning standard input. Used in error text only.
 *   lineNum    - Line number within the input source. Used in error text only.
 *
 * Returns the field converted to type T. Any failure in getTsvFieldValue is rethrown as
 * a plain Exception naming the file and line. Conversion failures on line one also get
 * a hint that the line may be a header.
 */
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum)
    pure @safe
    if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import std.format : format;
    import tsvutil : getTsvFieldValue;

    /* Label used for the input source in error messages. */
    string sourceLabel = "Standard Input";
    if (filename != "-") sourceLabel = filename;

    try
    {
        return getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException e)
    {
        /* The field was found but could not be converted to T. */
        string headerHint = "";
        if (lineNum == 1) headerHint = "\n Is this a header line? Use --H|header to skip.";

        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s%s",
                   e.msg, sourceLabel, lineNum, headerHint));
    }
    catch (Exception e)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s",
                   e.msg, sourceLabel, lineNum));
    }
}

unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper.
     */
    import std.exception;

    /* Successful conversions. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Failures are checked on line 1 (header hint path) and line 2 (no hint). */
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));   // Not a number.
    }
    foreach (size_t lineNum; 1 .. 3)
    {
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));   // Missing field.
    }
}

/* Unit tests for the main program start here.
 *
 * Portability note: Many of the tests here rely on generating consistent random numbers
 * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
 * possible this condition will not hold on other platforms.
*
 * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results these tests will need to be adjusted.
 */
version(unittest)
{
    /* Unit test helper functions. */

    import unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs tsv-sample in-process and asserts on its output.
     *
     * Params:
     *   cmdArgs  = Command line as main would receive it; cmdArgs[0] is used as the
     *              test label in assertion messages. Must not be empty.
     *   expected = Expected output rows; converted to text with tsvDataToString and
     *              compared to the generated output.
     *
     * The arguments are parsed with TsvSampleOptions.processArgs and the sampling
     * routine is chosen from the same derived flags main uses, with output collected
     * in an in-memory appender instead of being written to stdout.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Prefixes an assertion message with the helper name and the test label. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvSample] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSampleOptions cmdopt;
        /* processArgs takes cmdArgs by ref and truncates it, so keep a printable copy. */
        auto savedCmdArgs = cmdArgs.to!string;
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        auto output = appender!(char[])();

        if (cmdopt.useDistinctSampling)
        {
            distinctSampling(cmdopt, output);
        }
        else if (cmdopt.useStreamSampling)
        {
            streamSampling(cmdopt, output);
        }
        else if (cmdopt.sampleSize == 0)
        {
            reservoirSampling!(Yes.permuteAll)(cmdopt, output);
        }
        else
        {
            reservoirSampling!(No.permuteAll)(cmdopt, output);
        }

        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_sample");
scope(exit) testDir.rmdirRecurse; 699 700 /* Tabular data sets and expected results use the built-in static seed. 701 * Tests are run by writing the data set to a file, then calling the main 702 * routine to process. The function testTsvSample plays the role of the 703 * main program. Rather than writing to expected output, the results are 704 * matched against expected. The expected results were verified by hand 705 * prior to inclusion in the test. 706 * 707 * The initial part of this section is simply setting up data files and 708 * expected results. 709 */ 710 711 /* Empty file. */ 712 string[][] dataEmpty = []; 713 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 714 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 715 716 /* 3x1, header only. */ 717 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 718 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 719 writeUnittestTsvFile(fpath_data3x0, data3x0); 720 721 /* 3x1 */ 722 string[][] data3x1 = 723 [["field_a", "field_b", "field_c"], 724 ["tan", "タン", "8.5"]]; 725 726 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 727 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 728 writeUnittestTsvFile(fpath_data3x1, data3x1); 729 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 730 731 string[][] data3x2 = 732 [["field_a", "field_b", "field_c"], 733 ["brown", "褐色", "29.2"], 734 ["gray", "グレー", "6.2"]]; 735 736 /* 3x2 */ 737 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 738 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 739 writeUnittestTsvFile(fpath_data3x2, data3x2); 740 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 741 742 string[][] data3x2ExpectedNoWt = 743 [["field_a", "field_b", "field_c"], 744 ["gray", "グレー", "6.2"], 745 ["brown", "褐色", "29.2"]]; 746 747 /* 3x3 */ 748 string[][] data3x3 = 749 [["field_a", "field_b", "field_c"], 750 ["orange", "オレンジ", "2.5"], 751 ["pink", "ピンク", 
"1.1"], 752 ["purple", "紫の", "42"]]; 753 754 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 755 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 756 writeUnittestTsvFile(fpath_data3x3, data3x3); 757 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 758 759 string[][] data3x3ExpectedNoWt = 760 [["field_a", "field_b", "field_c"], 761 ["purple", "紫の", "42"], 762 ["pink", "ピンク", "1.1"], 763 ["orange", "オレンジ", "2.5"]]; 764 765 /* 3x6 */ 766 string[][] data3x6 = 767 [["field_a", "field_b", "field_c"], 768 ["red", "赤", "23.8"], 769 ["green", "緑", "0.0072"], 770 ["white", "白", "1.65"], 771 ["yellow", "黄", "12"], 772 ["blue", "青", "12"], 773 ["black", "黒", "0.983"]]; 774 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 775 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 776 writeUnittestTsvFile(fpath_data3x6, data3x6); 777 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 778 779 string[][] data3x6ExpectedNoWt = 780 [["field_a", "field_b", "field_c"], 781 ["yellow", "黄", "12"], 782 ["black", "黒", "0.983"], 783 ["blue", "青", "12"], 784 ["white", "白", "1.65"], 785 ["green", "緑", "0.0072"], 786 ["red", "赤", "23.8"]]; 787 788 string[][] data3x6ExpectedNoWtProbs = 789 [["random_weight", "field_a", "field_b", "field_c"], 790 ["0.960555462865159", "yellow", "黄", "12"], 791 ["0.757101539289579", "black", "黒", "0.983"], 792 ["0.525259808870032", "blue", "青", "12"], 793 ["0.492878549499437", "white", "白", "1.65"], 794 ["0.159293440869078", "green", "緑", "0.0072"], 795 ["0.010968807619065", "red", "赤", "23.8"]]; 796 797 string[][] data3x6ExpectedProbsStreamSampleP100 = 798 [["random_weight", "field_a", "field_b", "field_c"], 799 ["0.010968807619065", "red", "赤", "23.8"], 800 ["0.159293440869078", "green", "緑", "0.0072"], 801 ["0.492878549499437", "white", "白", "1.65"], 802 ["0.960555462865159", "yellow", "黄", "12"], 803 ["0.525259808870032", "blue", "青", "12"], 804 ["0.757101539289579", 
"black", "黒", "0.983"]]; 805 806 string[][] data3x6ExpectedProbsStreamSampleP60 = 807 [["random_weight", "field_a", "field_b", "field_c"], 808 ["0.010968807619065", "red", "赤", "23.8"], 809 ["0.159293440869078", "green", "緑", "0.0072"], 810 ["0.492878549499437", "white", "白", "1.65"], 811 ["0.525259808870032", "blue", "青", "12"]]; 812 813 string[][] data3x6ExpectedStreamSampleP60 = 814 [["field_a", "field_b", "field_c"], 815 ["red", "赤", "23.8"], 816 ["green", "緑", "0.0072"], 817 ["white", "白", "1.65"], 818 ["blue", "青", "12"]]; 819 820 string[][] data3x6ExpectedDistinctSampleK1K3P60 = 821 [["field_a", "field_b", "field_c"], 822 ["green", "緑", "0.0072"], 823 ["white", "白", "1.65"], 824 ["blue", "青", "12"]]; 825 826 string[][] data3x6ExpectedWt3Probs = 827 [["random_weight", "field_a", "field_b", "field_c"], 828 ["0.996651987576454", "yellow", "黄", "12"], 829 ["0.947758848098367", "blue", "青", "12"], 830 ["0.827282346822867", "red", "赤", "23.8"], 831 ["0.75346697377182", "black", "黒", "0.983"], 832 ["0.651301034964225", "white", "白", "1.65"], 833 ["1.56369437128799e-111", "green", "緑", "0.0072"]]; 834 835 string[][] data3x6ExpectedWt3 = 836 [["field_a", "field_b", "field_c"], 837 ["yellow", "黄", "12"], 838 ["blue", "青", "12"], 839 ["red", "赤", "23.8"], 840 ["black", "黒", "0.983"], 841 ["white", "白", "1.65"], 842 ["green", "緑", "0.0072"]]; 843 844 /* Using a different static seed. 
*/ 845 string[][] data3x6ExpectedNoWtV41Probs = 846 [["random_weight", "field_a", "field_b", "field_c"], 847 ["0.680572726530954", "green", "緑", "0.0072"], 848 ["0.676816243678331", "blue", "青", "12"], 849 ["0.32097338931635", "yellow", "黄", "12"], 850 ["0.250923618674278", "red", "赤", "23.8"], 851 ["0.155359342927113", "black", "黒", "0.983"], 852 ["0.0460958210751414", "white", "白", "1.65"]]; 853 854 string[][] data3x6ExpectedV41ProbsStreamSampleP60 = 855 [["random_weight", "field_a", "field_b", "field_c"], 856 ["0.250923618674278", "red", "赤", "23.8"], 857 ["0.0460958210751414", "white", "白", "1.65"], 858 ["0.32097338931635", "yellow", "黄", "12"], 859 ["0.155359342927113", "black", "黒", "0.983"]]; 860 861 string[][] data3x6ExpectedWt3V41Probs = 862 [["random_weight", "field_a", "field_b", "field_c"], 863 ["0.967993774989107", "blue", "青", "12"], 864 ["0.943562457925736", "red", "赤", "23.8"], 865 ["0.90964601024272", "yellow", "黄", "12"], 866 ["0.154916584092601", "white", "白", "1.65"], 867 ["0.15043620392537", "black", "黒", "0.983"], 868 ["6.13946748307015e-24", "green", "緑", "0.0072"]]; 869 870 871 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 872 string[][] combo1ExpectedNoWt = 873 [["field_a", "field_b", "field_c"], 874 ["yellow", "黄", "12"], 875 ["tan", "タン", "8.5"], 876 ["brown", "褐色", "29.2"], 877 ["green", "緑", "0.0072"], 878 ["red", "赤", "23.8"], 879 ["purple", "紫の", "42"], 880 ["black", "黒", "0.983"], 881 ["white", "白", "1.65"], 882 ["gray", "グレー", "6.2"], 883 ["blue", "青", "12"], 884 ["pink", "ピンク", "1.1"], 885 ["orange", "オレンジ", "2.5"]]; 886 887 string[][] combo1ExpectedNoWtProbs = 888 [["random_weight", "field_a", "field_b", "field_c"], 889 ["0.970885202754289", "yellow", "黄", "12"], 890 ["0.960555462865159", "tan", "タン", "8.5"], 891 ["0.817568943137303", "brown", "褐色", "29.2"], 892 ["0.757101539289579", "green", "緑", "0.0072"], 893 ["0.525259808870032", "red", "赤", "23.8"], 894 ["0.492878549499437", "purple", "紫の", "42"], 895 ["0.470815070671961", "black", "黒", "0.983"], 896 ["0.383881829213351", "white", "白", "1.65"], 897 ["0.292159906122833", "gray", "グレー", "6.2"], 898 ["0.240332160145044", "blue", "青", "12"], 899 ["0.159293440869078", "pink", "ピンク", "1.1"], 900 ["0.010968807619065", "orange", "オレンジ", "2.5"]]; 901 902 string[][] combo1ExpectedProbsStreamSampleP50 = 903 [["random_weight", "field_a", "field_b", "field_c"], 904 ["0.010968807619065", "orange", "オレンジ", "2.5"], 905 ["0.159293440869078", "pink", "ピンク", "1.1"], 906 ["0.492878549499437", "purple", "紫の", "42"], 907 ["0.383881829213351", "white", "白", "1.65"], 908 ["0.240332160145044", "blue", "青", "12"], 909 ["0.470815070671961", "black", "黒", "0.983"], 910 ["0.292159906122833", "gray", "グレー", "6.2"]]; 911 912 string[][] combo1ExpectedStreamSampleP40 = 913 [["field_a", "field_b", "field_c"], 914 ["orange", "オレンジ", "2.5"], 915 ["pink", "ピンク", "1.1"], 916 ["white", "白", "1.65"], 917 ["blue", "青", "12"], 918 ["gray", "グレー", "6.2"]]; 919 920 string[][] combo1ExpectedDistinctSampleK1P40 = 921 [["field_a", "field_b", "field_c"], 922 ["orange", "オレンジ", "2.5"], 923 ["red", "赤", "23.8"], 924 ["green", "緑", "0.0072"], 925 ["blue", "青", 
"12"], 926 ["black", "黒", "0.983"]]; 927 928 string[][] combo1ExpectedWt3Probs = 929 [["random_weight", "field_a", "field_b", "field_c"], 930 ["0.997540775237188", "yellow", "黄", "12"], 931 ["0.995276654400888", "tan", "タン", "8.5"], 932 ["0.993125789457417", "brown", "褐色", "29.2"], 933 ["0.983296025533894", "purple", "紫の", "42"], 934 ["0.973309619380837", "red", "赤", "23.8"], 935 ["0.887975515217396", "blue", "青", "12"], 936 ["0.819992304890418", "gray", "グレー", "6.2"], 937 ["0.559755692042509", "white", "白", "1.65"], 938 ["0.464721356092057", "black", "黒", "0.983"], 939 ["0.188245827041913", "pink", "ピンク", "1.1"], 940 ["0.164461318532999", "orange", "オレンジ", "2.5"], 941 ["1.64380869310205e-17", "green", "緑", "0.0072"]]; 942 943 string[][] combo1ExpectedWt3 = 944 [["field_a", "field_b", "field_c"], 945 ["yellow", "黄", "12"], 946 ["tan", "タン", "8.5"], 947 ["brown", "褐色", "29.2"], 948 ["purple", "紫の", "42"], 949 ["red", "赤", "23.8"], 950 ["blue", "青", "12"], 951 ["gray", "グレー", "6.2"], 952 ["white", "白", "1.65"], 953 ["black", "黒", "0.983"], 954 ["pink", "ピンク", "1.1"], 955 ["orange", "オレンジ", "2.5"], 956 ["green", "緑", "0.0072"]]; 957 958 /* 1x10 - Simple 1-column file. */ 959 string[][] data1x10 = 960 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 961 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 962 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 963 writeUnittestTsvFile(fpath_data1x10, data1x10); 964 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 965 966 string[][] data1x10ExpectedNoWt = 967 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 968 969 string[][] data1x10ExpectedWt1 = 970 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 971 972 /* 2x10a - Uniform distribution [0,1]. 
*/ 973 string[][] data2x10a = 974 [["line", "weight"], 975 ["1", "0.26788837"], 976 ["2", "0.06601298"], 977 ["3", "0.38627527"], 978 ["4", "0.47379424"], 979 ["5", "0.02966641"], 980 ["6", "0.05636231"], 981 ["7", "0.70529242"], 982 ["8", "0.91836862"], 983 ["9", "0.99103720"], 984 ["10", "0.31401740"]]; 985 986 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 987 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 988 989 string[][] data2x10aExpectedWt2Probs = 990 [["random_weight", "line", "weight"], 991 ["0.968338654945437", "8", "0.91836862"], 992 ["0.918568420544139", "4", "0.47379424"], 993 ["0.257308320877951", "7", "0.70529242"], 994 ["0.237253179070181", "9", "0.99103720"], 995 ["0.160160967018722", "3", "0.38627527"], 996 ["0.0908196626672434", "10", "0.31401740"], 997 ["0.00717645392443612", "6", "0.05636231"], 998 ["4.83186429516301e-08", "1", "0.26788837"], 999 ["3.75256929665355e-10", "5", "0.02966641"], 1000 ["8.21232478800958e-13", "2", "0.06601298"]]; 1001 1002 /* 2x10b - Uniform distribution [0,1000]. */ 1003 string[][] data2x10b = 1004 [["line", "weight"], 1005 ["1", "761"], 1006 ["2", "432"], 1007 ["3", "103"], 1008 ["4", "448"], 1009 ["5", "750"], 1010 ["6", "711"], 1011 ["7", "867"], 1012 ["8", "841"], 1013 ["9", "963"], 1014 ["10", "784"]]; 1015 1016 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 1017 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 1018 1019 string[][] data2x10bExpectedWt2Probs = 1020 [["random_weight", "line", "weight"], 1021 ["0.99996486739068", "8", "841"], 1022 ["0.999910174671372", "4", "448"], 1023 ["0.999608715248737", "6", "711"], 1024 ["0.999141885371438", "5", "750"], 1025 ["0.999039632502748", "10", "784"], 1026 ["0.998896318259319", "7", "867"], 1027 ["0.998520583151911", "9", "963"], 1028 ["0.995756696791589", "2", "432"], 1029 ["0.994087587320506", "1", "761"], 1030 ["0.993154677612124", "3", "103"]]; 1031 1032 /* 2x10c - Logarithmic distribution in random order. 
*/ 1033 string[][] data2x10c = 1034 [["line", "weight"], 1035 ["1", "31.85"], 1036 ["2", "17403.31"], 1037 ["3", "653.84"], 1038 ["4", "8.23"], 1039 ["5", "2671.04"], 1040 ["6", "26226.08"], 1041 ["7", "1.79"], 1042 ["8", "354.56"], 1043 ["9", "35213.81"], 1044 ["10", "679.29"]]; 1045 1046 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 1047 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 1048 1049 string[][] data2x10cExpectedWt2Probs = 1050 [["random_weight", "line", "weight"], 1051 ["0.999989390087097", "6", "26226.08"], 1052 ["0.999959512916955", "9", "35213.81"], 1053 ["0.999916669076135", "8", "354.56"], 1054 ["0.999894450521864", "2", "17403.31"], 1055 ["0.999758976028616", "5", "2671.04"], 1056 ["0.998918527698776", "3", "653.84"], 1057 ["0.998891677527825", "10", "679.29"], 1058 ["0.995122075068501", "4", "8.23"], 1059 ["0.86789371584259", "1", "31.85"], 1060 ["0.585744381629156", "7", "1.79"]]; 1061 1062 /* 2x10d. Logarithmic distribution in ascending order. */ 1063 string[][] data2x10d = 1064 [["line", "weight"], 1065 ["1", "1.79"], 1066 ["2", "8.23"], 1067 ["3", "31.85"], 1068 ["4", "354.56"], 1069 ["5", "653.84"], 1070 ["6", "679.29"], 1071 ["7", "2671.04"], 1072 ["8", "17403.31"], 1073 ["9", "26226.08"], 1074 ["10", "35213.81"]]; 1075 1076 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 1077 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 1078 1079 string[][] data2x10dExpectedWt2Probs = 1080 [["random_weight", "line", "weight"], 1081 ["0.999998302218464", "8", "17403.31"], 1082 ["0.999978608340414", "10", "35213.81"], 1083 ["0.999945638289867", "9", "26226.08"], 1084 ["0.999886503635757", "4", "354.56"], 1085 ["0.999641619391901", "7", "2671.04"], 1086 ["0.999590453389486", "6", "679.29"], 1087 ["0.999015744906398", "5", "653.84"], 1088 ["0.978031633047474", "3", "31.85"], 1089 ["0.799947918069109", "2", "8.23"], 1090 ["0.0803742612399491", "1", "1.79"]]; 1091 1092 /* 2x10e. Logarithmic distribution in descending order. 
*/ 1093 string[][] data2x10e = 1094 [["line", "weight"], 1095 ["1", "35213.81"], 1096 ["2", "26226.08"], 1097 ["3", "17403.31"], 1098 ["4", "2671.04"], 1099 ["5", "679.29"], 1100 ["6", "653.84"], 1101 ["7", "354.56"], 1102 ["8", "31.85"], 1103 ["9", "8.23"], 1104 ["10", "1.79"]]; 1105 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 1106 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 1107 1108 string[][] data2x10eExpectedWt2Probs = 1109 [["random_weight", "line", "weight"], 1110 ["0.999984933489752", "4", "2671.04"], 1111 ["0.999959348072026", "3", "17403.31"], 1112 ["0.999929957397275", "2", "26226.08"], 1113 ["0.999871856792456", "1", "35213.81"], 1114 ["0.999574515631739", "6", "653.84"], 1115 ["0.999072736502096", "8", "31.85"], 1116 ["0.999052603129689", "5", "679.29"], 1117 ["0.997303336505164", "7", "354.56"], 1118 ["0.840939024352278", "9", "8.23"], 1119 ["0.6565001592629", "10", "1.79"]]; 1120 1121 /* Data sets for distinct sampling. */ 1122 string[][] data5x25 = 1123 [["ID", "Shape", "Color", "Size", "Weight"], 1124 ["01", "circle", "red", "S", "10"], 1125 ["02", "circle", "black", "L", "20"], 1126 ["03", "square", "black", "L", "20"], 1127 ["04", "circle", "green", "L", "30"], 1128 ["05", "ellipse", "red", "S", "20"], 1129 ["06", "triangle", "red", "S", "10"], 1130 ["07", "triangle", "red", "L", "20"], 1131 ["08", "square", "black", "S", "10"], 1132 ["09", "circle", "black", "S", "20"], 1133 ["10", "square", "green", "L", "20"], 1134 ["11", "triangle", "red", "L", "20"], 1135 ["12", "circle", "green", "L", "30"], 1136 ["13", "ellipse", "red", "S", "20"], 1137 ["14", "circle", "green", "L", "30"], 1138 ["15", "ellipse", "red", "L", "30"], 1139 ["16", "square", "red", "S", "10"], 1140 ["17", "circle", "black", "L", "20"], 1141 ["18", "square", "red", "S", "20"], 1142 ["19", "square", "black", "L", "20"], 1143 ["20", "circle", "red", "S", "10"], 1144 ["21", "ellipse", "black", "L", "30"], 1145 ["22", "triangle", "red", "L", "30"], 1146 
["23", "circle", "green", "S", "20"], 1147 ["24", "square", "green", "L", "20"], 1148 ["25", "circle", "red", "S", "10"], 1149 ]; 1150 1151 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 1152 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 1153 writeUnittestTsvFile(fpath_data5x25, data5x25); 1154 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]); 1155 1156 string[][] data5x25ExpectedDistinctSampleK2P40 = 1157 [["ID", "Shape", "Color", "Size", "Weight"], 1158 ["03", "square", "black", "L", "20"], 1159 ["05", "ellipse", "red", "S", "20"], 1160 ["08", "square", "black", "S", "10"], 1161 ["10", "square", "green", "L", "20"], 1162 ["13", "ellipse", "red", "S", "20"], 1163 ["15", "ellipse", "red", "L", "30"], 1164 ["16", "square", "red", "S", "10"], 1165 ["18", "square", "red", "S", "20"], 1166 ["19", "square", "black", "L", "20"], 1167 ["21", "ellipse", "black", "L", "30"], 1168 ["24", "square", "green", "L", "20"], 1169 ]; 1170 1171 string[][] data5x25ExpectedDistinctSampleK2K4P20 = 1172 [["ID", "Shape", "Color", "Size", "Weight"], 1173 ["03", "square", "black", "L", "20"], 1174 ["07", "triangle", "red", "L", "20"], 1175 ["08", "square", "black", "S", "10"], 1176 ["10", "square", "green", "L", "20"], 1177 ["11", "triangle", "red", "L", "20"], 1178 ["16", "square", "red", "S", "10"], 1179 ["18", "square", "red", "S", "20"], 1180 ["19", "square", "black", "L", "20"], 1181 ["22", "triangle", "red", "L", "30"], 1182 ["24", "square", "green", "L", "20"], 1183 ]; 1184 1185 string[][] data5x25ExpectedDistinctSampleK2K3K4P20 = 1186 [["ID", "Shape", "Color", "Size", "Weight"], 1187 ["04", "circle", "green", "L", "30"], 1188 ["07", "triangle", "red", "L", "20"], 1189 ["09", "circle", "black", "S", "20"], 1190 ["11", "triangle", "red", "L", "20"], 1191 ["12", "circle", "green", "L", "30"], 1192 ["14", "circle", "green", "L", "30"], 1193 ["16", "square", "red", "S", "10"], 1194 ["18", "square", "red", "S", "20"], 1195 ["22", 
"triangle", "red", "L", "30"], 1196 ]; 1197 1198 /* 1199 * Enough setup! Actually run some tests! 1200 */ 1201 1202 /* Basic tests. Headers and static seed. With weights and without. */ 1203 testTsvSample(["test-a1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 1204 testTsvSample(["test-a2", "--header", "--static-seed", fpath_data3x0], data3x0); 1205 testTsvSample(["test-a3", "-H", "-s", fpath_data3x1], data3x1); 1206 testTsvSample(["test-a4", "-H", "-s", fpath_data3x2], data3x2ExpectedNoWt); 1207 testTsvSample(["test-a5", "-H", "-s", fpath_data3x3], data3x3ExpectedNoWt); 1208 testTsvSample(["test-a6", "-H", "-s", fpath_data3x6], data3x6ExpectedNoWt); 1209 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs); 1210 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedWt3); 1211 testTsvSample(["test-a9", "-H", "-s", "-p", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs); 1212 testTsvSample(["test-a10", "-H", "--seed-value", "41", "-p", fpath_data3x6], data3x6ExpectedNoWtV41Probs); 1213 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "-p", fpath_data3x6], data3x6ExpectedNoWtV41Probs); 1214 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "-p", fpath_data3x6], data3x6ExpectedNoWtProbs); 1215 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "-p", fpath_data3x6], data3x6ExpectedWt3V41Probs); 1216 1217 /* Stream sampling cases. 
*/ 1218 testTsvSample(["test-a14", "--header", "--static-seed", "--rate", "0.001", fpath_dataEmpty], dataEmpty); 1219 testTsvSample(["test-a15", "--header", "--static-seed", "--rate", "0.001", fpath_data3x0], data3x0); 1220 testTsvSample(["test-a16", "-H", "-s", "-r", "1.0", fpath_data3x1], data3x1); 1221 testTsvSample(["test-a17", "-H", "-s", "-r", "1.0", fpath_data3x6], data3x6); 1222 testTsvSample(["test-a18", "-H", "-r", "1.0", fpath_data3x6], data3x6); 1223 testTsvSample(["test-a19", "-H", "-s", "--rate", "1.0", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP100); 1224 testTsvSample(["test-a20", "-H", "-s", "--rate", "0.60", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP60); 1225 testTsvSample(["test-a21", "-H", "-s", "--rate", "0.60", fpath_data3x6], data3x6ExpectedStreamSampleP60); 1226 testTsvSample(["test-a22", "-H", "-v", "41", "--rate", "0.60", "-p", fpath_data3x6], data3x6ExpectedV41ProbsStreamSampleP60); 1227 1228 /* Distinct sampling cases. */ 1229 testTsvSample(["test-a23", "--header", "--static-seed", "--rate", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 1230 testTsvSample(["test-a24", "--header", "--static-seed", "--rate", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 1231 testTsvSample(["test-a25", "-H", "-s", "-r", "1.0", "-k", "2", fpath_data3x1], data3x1); 1232 testTsvSample(["test-a26", "-H", "-s", "-r", "1.0", "-k", "2", fpath_data3x6], data3x6); 1233 1234 /* Basic tests, without headers. 
*/ 1235 testTsvSample(["test-b1", "-s", fpath_data3x1_noheader], data3x1[1..$]); 1236 testTsvSample(["test-b2", "-s", fpath_data3x2_noheader], data3x2ExpectedNoWt[1..$]); 1237 testTsvSample(["test-b3", "-s", fpath_data3x3_noheader], data3x3ExpectedNoWt[1..$]); 1238 testTsvSample(["test-b4", "-s", fpath_data3x6_noheader], data3x6ExpectedNoWt[1..$]); 1239 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..$]); 1240 testTsvSample(["test-b6", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..$]); 1241 testTsvSample(["test-b7", "-s", "-p", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..$]); 1242 testTsvSample(["test-b8", "-v", "41", "-p", fpath_data3x6_noheader], data3x6ExpectedNoWtV41Probs[1..$]); 1243 testTsvSample(["test-b9", "-v", "41", "-w", "3", "-p", fpath_data3x6_noheader], data3x6ExpectedWt3V41Probs[1..$]); 1244 1245 /* Stream sampling cases. */ 1246 testTsvSample(["test-b10", "-s", "-r", "1.0", fpath_data3x1_noheader], data3x1[1..$]); 1247 testTsvSample(["test-b11", "-s", "-r", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 1248 testTsvSample(["test-b12", "-r", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 1249 testTsvSample(["test-b13", "-s", "--rate", "1.0", "-p", fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP100[1..$]); 1250 testTsvSample(["test-b14", "-s", "--rate", "0.60", "-p", fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP60[1..$]); 1251 testTsvSample(["test-b15", "-v", "41", "--rate", "0.60", "-p", fpath_data3x6_noheader], data3x6ExpectedV41ProbsStreamSampleP60[1..$]); 1252 1253 /* Distinct sampling cases. 
*/ 1254 testTsvSample(["test-a25", "-s", "-r", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]); 1255 testTsvSample(["test-a26", "-s", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 1256 testTsvSample(["test-a27", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 1257 testTsvSample(["test-a28", "-v", "71563", "-r", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 1258 1259 /* Multi-file tests. */ 1260 testTsvSample(["test-c1", "--header", "--static-seed", 1261 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1262 combo1ExpectedNoWt); 1263 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 1264 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1265 combo1ExpectedNoWtProbs); 1266 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 1267 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1268 combo1ExpectedWt3Probs); 1269 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", 1270 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1271 combo1ExpectedWt3); 1272 1273 /* Multi-file, no headers. 
*/ 1274 testTsvSample(["test-c5", "--static-seed", 1275 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1276 fpath_data3x6_noheader, fpath_data3x2_noheader], 1277 combo1ExpectedNoWt[1..$]); 1278 testTsvSample(["test-c6", "--static-seed", "--print-random", 1279 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1280 fpath_data3x6_noheader, fpath_data3x2_noheader], 1281 combo1ExpectedNoWtProbs[1..$]); 1282 testTsvSample(["test-c7", "--static-seed", "--print-random", "--weight-field", "3", 1283 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1284 fpath_data3x6_noheader, fpath_data3x2_noheader], 1285 combo1ExpectedWt3Probs[1..$]); 1286 testTsvSample(["test-c8", "--static-seed", "--weight-field", "3", 1287 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1288 fpath_data3x6_noheader, fpath_data3x2_noheader], 1289 combo1ExpectedWt3[1..$]); 1290 1291 /* Stream sampling cases. */ 1292 testTsvSample(["test-c9", "--header", "--static-seed", "--print-random", "--rate", ".5", 1293 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1294 combo1ExpectedProbsStreamSampleP50); 1295 testTsvSample(["test-c10", "--header", "--static-seed", "--rate", ".4", 1296 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1297 combo1ExpectedStreamSampleP40); 1298 testTsvSample(["test-c11", "--static-seed", "--print-random", "--rate", ".5", 1299 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1300 fpath_data3x6_noheader, fpath_data3x2_noheader], 1301 combo1ExpectedProbsStreamSampleP50[1..$]); 1302 testTsvSample(["test-c12", "--static-seed", "--rate", ".4", 1303 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1304 fpath_data3x6_noheader, fpath_data3x2_noheader], 1305 combo1ExpectedStreamSampleP40[1..$]); 1306 1307 /* Distinct sampling cases. 
*/ 1308 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--rate", ".4", 1309 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 1310 combo1ExpectedDistinctSampleK1P40); 1311 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--rate", ".4", 1312 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 1313 fpath_data3x6_noheader, fpath_data3x2_noheader], 1314 combo1ExpectedDistinctSampleK1P40[1..$]); 1315 1316 /* Single column file. */ 1317 testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt); 1318 testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt); 1319 1320 /* Distributions. */ 1321 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10a], data2x10aExpectedWt2Probs); 1322 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10b], data2x10bExpectedWt2Probs); 1323 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10c], data2x10cExpectedWt2Probs); 1324 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10d], data2x10dExpectedWt2Probs); 1325 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "-p", fpath_data2x10e], data2x10eExpectedWt2Probs); 1326 1327 /* Tests of subset sample (--n|num) field. 1328 * 1329 * Note: The way these tests are done ensures that subset length does not affect 1330 * output order. 
1331 */ 1332 import std.algorithm : min; 1333 for (size_t n = data3x6.length + 2; n >= 1; n--) 1334 { 1335 size_t expectedLength = min(data3x6.length, n + 1); 1336 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 1337 "-H", fpath_data3x6], data3x6ExpectedNoWt[0..expectedLength]); 1338 1339 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 1340 "-H", "-p", fpath_data3x6], data3x6ExpectedNoWtProbs[0..expectedLength]); 1341 1342 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 1343 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedWt3[0..expectedLength]); 1344 1345 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 1346 "-H", "-p", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs[0..expectedLength]); 1347 1348 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 1349 fpath_data3x6_noheader], data3x6ExpectedNoWt[1..expectedLength]); 1350 1351 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 1352 "-p", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..expectedLength]); 1353 1354 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 1355 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..expectedLength]); 1356 1357 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 1358 "-p", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..expectedLength]); 1359 1360 import std.algorithm : min; 1361 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedProbsStreamSampleP60.length); 1362 1363 testTsvSample([format("test-f9_%d", n), "-s", "-r", "0.6", "-n", n.to!string, 1364 "-H", "-p", fpath_data3x6], data3x6ExpectedProbsStreamSampleP60[0..sampleExpectedLength]); 1365 1366 testTsvSample([format("test-f10_%d", n), "-s", "-r", "0.6", "-n", n.to!string, 1367 "-H", fpath_data3x6], data3x6ExpectedStreamSampleP60[0..sampleExpectedLength]); 1368 1369 testTsvSample([format("test-f11_%d", n), "-s", "-r", "0.6", "-n", n.to!string, 1370 "-p", 
fpath_data3x6_noheader], data3x6ExpectedProbsStreamSampleP60[1..sampleExpectedLength]); 1371 1372 testTsvSample([format("test-f12_%d", n), "-s", "-r", "0.6", "-n", n.to!string, 1373 fpath_data3x6_noheader], data3x6ExpectedStreamSampleP60[1..sampleExpectedLength]); 1374 1375 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctSampleK1K3P60.length); 1376 1377 testTsvSample([format("test-f13_%d", n), "-s", "-k", "1,3", "-r", "0.6", "-n", n.to!string, 1378 "-H", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60[0..distinctExpectedLength]); 1379 1380 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-r", "0.6", "-n", n.to!string, 1381 fpath_data3x6_noheader], data3x6ExpectedDistinctSampleK1K3P60[1..distinctExpectedLength]); 1382 } 1383 1384 /* Similar tests with the 1x10 data set. */ 1385 for (size_t n = data1x10.length + 2; n >= 1; n--) 1386 { 1387 size_t expectedLength = min(data1x10.length, n + 1); 1388 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 1389 "-H", fpath_data1x10], data1x10ExpectedNoWt[0..expectedLength]); 1390 1391 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 1392 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedWt1[0..expectedLength]); 1393 1394 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 1395 fpath_data1x10_noheader], data1x10ExpectedNoWt[1..expectedLength]); 1396 1397 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 1398 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedWt1[1..expectedLength]); 1399 } 1400 1401 /* Distinct sampling tests. 
*/ 1402 testTsvSample(["h1", "--header", "--static-seed", "--rate", "0.40", "--key-fields", "2", fpath_data5x25], 1403 data5x25ExpectedDistinctSampleK2P40); 1404 1405 testTsvSample(["h2", "-H", "-s", "-r", "0.20", "-k", "2,4", fpath_data5x25], 1406 data5x25ExpectedDistinctSampleK2K4P20); 1407 1408 testTsvSample(["h3", "-H", "-s", "-r", "0.20", "-k", "2-4", fpath_data5x25], 1409 data5x25ExpectedDistinctSampleK2K3K4P20); 1410 1411 testTsvSample(["h4", "--static-seed", "--rate", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 1412 data5x25ExpectedDistinctSampleK2P40[1..$]); 1413 1414 testTsvSample(["h5", "-s", "-r", "0.20", "-k", "2,4", fpath_data5x25_noheader], 1415 data5x25ExpectedDistinctSampleK2K4P20[1..$]); 1416 1417 testTsvSample(["h6", "-s", "-r", "0.20", "-k", "2-4", fpath_data5x25_noheader], 1418 data5x25ExpectedDistinctSampleK2K3K4P20[1..$]); 1419 }