/**
Command line tool for randomizing or sampling lines from input streams. Several
sampling methods are available, including simple random sampling, weighted random
sampling, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point. Parses the command line, dispatches to the selected
     * sampling routine, and reports errors on stderr.
     *
     * Returns: Zero on success, one on failure.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            import tsvutil : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";

auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the algorithms work by generating a random
value for each line. The nature of these values depends on the sampling
algorithm. They are used for both line selection and output ordering. The
'--print-random' option can be used to print these values. The random
value is prepended to the line separated by the --d|delimiter char (TAB by
default). The '--gen-random-inorder' option takes this one step further,
generating random values for all input lines without changing the input
order. The types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Options:
EOS";

/** Container for command line options.
 */
struct TsvSampleOptions
{
    string programName;                        // Derived from argv[0].
    string[] files;                            // Input files; "-" denotes standard input.
    bool helpVerbose = false;                  // --help-verbose
    bool hasHeader = false;                    // --H|header
    size_t sampleSize = 0;                     // --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  // --p|prob - Inclusion probability
    size_t[] keyFields;                        // --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    // --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           // --r|replace
    bool staticSeed = false;                   // --s|static-seed
    uint seedValueOptionArg = 0;               // --v|seed-value
    bool printRandom = false;                  // --print-random
    bool genRandomInorder = false;             // --gen-random-inorder
    string randomValueHeader = "random_value"; // --random-value-header
    char delim = '\t';                         // --d|delimiter
    bool versionWanted = false;                // --V|version
    bool hasWeightField = false;               // Derived.
    bool useBernoulliSampling = false;         // Derived.
    bool useDistinctSampling = false;          // Derived.
    bool usingUnpredictableSeed = true;        // Derived from --static-seed, --seed-value
    uint seed = 0;                             // Derived from --static-seed, --seed-value

    /** Parses the command line, fills in this struct, and validates option
     * combinations. Remaining arguments are treated as input files.
     *
     * Returns: tuple(success, exitCode). success is false when the program
     * should terminate (help/version printed, or an error occurred); in that
     * case exitCode is the value main should return.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : canFind;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsvutil : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose",    "     Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header",        "     Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num",           "NUM  Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob",          "NUM  Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields",    "<field-list>  Fields to use as key for distinct sampling. Use with --p|prob.",
                keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex),

                "w|weight-field",  "NUM  Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace",       "     Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed",   "     Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value",    "NUM  Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random",       "     Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", "     Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header", "    Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,

                "d|delimiter",     "CHR  Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version",       "     Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling
             * and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");
                if (genRandomInorder && !useDistinctSampling) throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
            }
            else if (genRandomInorder && !hasWeightField)
            {
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Invokes the appropriate sampling routine based on the command line arguments.
338 */ 339 void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 340 if (isOutputRange!(OutputRange, char)) 341 { 342 if (cmdopt.srsWithReplacement) 343 { 344 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 345 } 346 else if (cmdopt.useBernoulliSampling) 347 { 348 if (cmdopt.genRandomInorder) bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 349 else bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 350 } 351 else if (cmdopt.useDistinctSampling) 352 { 353 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 354 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 355 } 356 else if (cmdopt.genRandomInorder) 357 { 358 assert(cmdopt.hasWeightField); 359 generateWeightedRandomValuesInorder(cmdopt, outputStream); 360 } 361 else if (cmdopt.sampleSize != 0) 362 { 363 if (cmdopt.hasWeightField) reservoirSampling!(Yes.isWeighted)(cmdopt, outputStream); 364 else reservoirSampling!(No.isWeighted)(cmdopt, outputStream); 365 } 366 else 367 { 368 if (cmdopt.hasWeightField) randomizeLines!(Yes.isWeighted)(cmdopt, outputStream); 369 else randomizeLines!(No.isWeighted)(cmdopt, outputStream); 370 } 371 } 372 373 /** Bernoulli sampling on the input stream. Each input line is a assigned a random 374 * value and output if less than the inclusion probability. The order of the lines 375 * is not changed. 376 * 377 * Note: Performance tests show that skip sampling is faster when the inclusion 378 * probability is approximately 4-5% or less. A performance optimization would be to 379 * create a separate function for cases when the probability is small and the random 380 * weights are not being output with each line. A disadvantage would be that the 381 * random weights assigned to each element would change based on the sampling. Printed 382 * weights would no longer be consistent run-to-run. 
383 */ 384 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 385 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 386 if (isOutputRange!(OutputRange, char)) 387 { 388 import std.format : formatValue, singleSpec; 389 import std.random : Random, uniform01; 390 import tsvutil : throwIfWindowsNewlineOnUnix; 391 392 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 393 else assert(!cmdopt.genRandomInorder); 394 395 auto randomGenerator = Random(cmdopt.seed); 396 immutable randomValueFormatSpec = singleSpec("%.17g"); 397 398 /* Process each line. */ 399 bool headerWritten = false; 400 size_t numLinesWritten = 0; 401 foreach (filename; cmdopt.files) 402 { 403 auto inputStream = (filename == "-") ? stdin : filename.File(); 404 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 405 { 406 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 407 if (fileLineNum == 1 && cmdopt.hasHeader) 408 { 409 if (!headerWritten) 410 { 411 static if (generateRandomAll) 412 { 413 outputStream.put(cmdopt.randomValueHeader); 414 outputStream.put(cmdopt.delim); 415 } 416 else if (cmdopt.printRandom) 417 { 418 outputStream.put(cmdopt.randomValueHeader); 419 outputStream.put(cmdopt.delim); 420 } 421 422 outputStream.put(line); 423 outputStream.put("\n"); 424 headerWritten = true; 425 } 426 } 427 else 428 { 429 double lineScore = uniform01(randomGenerator); 430 431 static if (generateRandomAll) 432 { 433 outputStream.formatValue(lineScore, randomValueFormatSpec); 434 outputStream.put(cmdopt.delim); 435 outputStream.put(line); 436 outputStream.put("\n"); 437 438 if (cmdopt.sampleSize != 0) 439 { 440 ++numLinesWritten; 441 if (numLinesWritten == cmdopt.sampleSize) return; 442 } 443 } 444 else if (lineScore < cmdopt.inclusionProbability) 445 { 446 if (cmdopt.printRandom) 447 { 448 outputStream.formatValue(lineScore, randomValueFormatSpec); 449 outputStream.put(cmdopt.delim); 450 } 451 
outputStream.put(line); 452 outputStream.put("\n"); 453 454 if (cmdopt.sampleSize != 0) 455 { 456 ++numLinesWritten; 457 if (numLinesWritten == cmdopt.sampleSize) return; 458 } 459 } 460 } 461 } 462 } 463 } 464 465 /** Sample a subset of the unique values from the key fields. 466 * 467 * Distinct sampling is done by hashing the key and mapping the hash value into 468 * buckets matching the inclusion probability. Records having a key mapping to bucket 469 * zero are output. 470 */ 471 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 472 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 473 if (isOutputRange!(OutputRange, char)) 474 { 475 import std.algorithm : splitter; 476 import std.conv : to; 477 import std.digest.murmurhash; 478 import std.math : lrint; 479 import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix; 480 481 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 482 else assert(!cmdopt.genRandomInorder); 483 484 assert(cmdopt.keyFields.length > 0); 485 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 486 487 static if (generateRandomAll) 488 { 489 import std.format : formatValue, singleSpec; 490 immutable randomValueFormatSpec = singleSpec("%d"); 491 } 492 493 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 494 495 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 496 497 /* Create a mapping for the key fields. */ 498 auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.keyFields); 499 500 /* Process each line. */ 501 bool headerWritten = false; 502 size_t numLinesWritten = 0; 503 foreach (filename; cmdopt.files) 504 { 505 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 506 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 507 { 508 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 509 if (fileLineNum == 1 && cmdopt.hasHeader) 510 { 511 if (!headerWritten) 512 { 513 static if (generateRandomAll) 514 { 515 outputStream.put(cmdopt.randomValueHeader); 516 outputStream.put(cmdopt.delim); 517 } 518 else if (cmdopt.printRandom) 519 { 520 outputStream.put(cmdopt.randomValueHeader); 521 outputStream.put(cmdopt.delim); 522 } 523 524 outputStream.put(line); 525 outputStream.put("\n"); 526 headerWritten = true; 527 } 528 } 529 else 530 { 531 /* Gather the key field values and assemble the key. */ 532 keyFieldsReordering.initNewLine; 533 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 534 { 535 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 536 if (keyFieldsReordering.allFieldsFilled) break; 537 } 538 539 if (!keyFieldsReordering.allFieldsFilled) 540 { 541 import std.format : format; 542 throw new Exception( 543 format("Not enough fields in line. File: %s, Line: %s", 544 (filename == "-") ? 
"Standard Input" : filename, fileLineNum)); 545 } 546 547 auto hasher = MurmurHash3!32(cmdopt.seed); 548 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 549 { 550 if (count > 0) hasher.put(delimArray); 551 hasher.put(cast(ubyte[]) key); 552 } 553 hasher.finish; 554 555 static if (generateRandomAll) 556 { 557 import std.conv : to; 558 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 559 outputStream.put(cmdopt.delim); 560 outputStream.put(line); 561 outputStream.put("\n"); 562 563 if (cmdopt.sampleSize != 0) 564 { 565 ++numLinesWritten; 566 if (numLinesWritten == cmdopt.sampleSize) return; 567 } 568 } 569 else if (hasher.get % numBuckets == 0) 570 { 571 if (cmdopt.printRandom) 572 { 573 outputStream.put('0'); 574 outputStream.put(cmdopt.delim); 575 } 576 outputStream.put(line); 577 outputStream.put("\n"); 578 579 if (cmdopt.sampleSize != 0) 580 { 581 ++numLinesWritten; 582 if (numLinesWritten == cmdopt.sampleSize) return; 583 } 584 } 585 } 586 } 587 } 588 } 589 590 /** An implementation of reservior sampling. Both weighted and uniform random sampling 591 * are supported. 592 * 593 * Both weighted and uniform random sampling are implemented using the one-pass algorithm 594 * described by Pavlos Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data 595 * Streams", Pavlos S. Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted 596 * case weights are simply set to one. 597 * 598 * The implementation uses a heap (priority queue) large enough to hold the desired 599 * number of lines. Input is read line-by-line, assigned a random value, and added to the 600 * heap. The role of the identify the lines with the highest assigned random values. Once 601 * the heap is full, adding a new line means dropping the line with the lowest score. A 602 * "min" heap used for this reason. 603 * 604 * When done reading all lines, the "min" heap is in the opposite order needed for output. 
605 * The desired order is obtained by removing each element one at at time from the heap. 606 * The underlying data store will have the elements in correct order. 607 * 608 * Generating output in weighted order matters for several reasons: 609 * - For weighted sampling, it preserves the property that smaller valid subsets can be 610 * created by taking the first N lines. 611 * - For unweighted sampling, it ensures that all output permutations are possible, and 612 * are not influences by input order or the heap data structure used. 613 * - Order consistency when making repeated use of the same random seeds, but with 614 * different sample sizes. 615 * 616 * There are use cases where only the selection set matters, for these some performance 617 * could be gained by skipping the reordering and simply printing the backing store 618 * array in-order, but making this distinction seems an unnecessary complication. 619 * 620 * Notes: 621 * - In tsv-sample versions 1.2.1 and earlier this routine also supported randomization 622 * of all input lines. This was dropped in version 1.2.2 in favor of the approach 623 * used in randomizeLines. The latter has significant advantages given that all data 624 * data must be read into memory. 625 * - The unweighted case could be sped up by adopting what is commonly known as 626 * "Algorithm R" followed by a random walk on the resulting reservoir (e.g. 627 * std.random.randomCover in the D standard library). This is faster than reversing 628 * the heap prior to output. The downsides are that the result order would not be 629 * consistent with the other routines and that random number printing does not make 630 * sense. Order consistency matters only in the rare case when multiple randomizations 631 * are being done with the same static seed. For a description of Algorithm R see: 632 * https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R. 
633 */ 634 void reservoirSampling(Flag!"isWeighted" isWeighted, OutputRange) 635 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 636 if (isOutputRange!(OutputRange, char)) 637 { 638 import std.container.array; 639 import std.container.binaryheap; 640 import std.format : formatValue, singleSpec; 641 import std.random : Random, uniform01; 642 import tsvutil : throwIfWindowsNewlineOnUnix; 643 644 static if (isWeighted) assert(cmdopt.hasWeightField); 645 else assert(!cmdopt.hasWeightField); 646 647 assert(cmdopt.sampleSize > 0); 648 649 auto randomGenerator = Random(cmdopt.seed); 650 651 struct Entry 652 { 653 double score; 654 char[] line; 655 } 656 657 /* Create the heap and backing data store. 658 * 659 * Note: An std.container.array is used as the backing store to avoid some issues in 660 * the standard library (Phobos) binaryheap implementation. Specifically, when an 661 * std.container.array is used as backing store, the heap can efficiently reversed by 662 * removing the heap elements. This leaves the backing store in the reversed order. 663 * However, the current binaryheap implementation does not support this for all 664 * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. 665 */ 666 667 Array!Entry dataStore; 668 dataStore.reserve(cmdopt.sampleSize); 669 auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap 670 671 /* Process each line. */ 672 bool headerWritten = false; 673 foreach (filename; cmdopt.files) 674 { 675 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 676 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 677 { 678 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 679 if (fileLineNum == 1 && cmdopt.hasHeader) 680 { 681 if (!headerWritten) 682 { 683 if (cmdopt.printRandom) 684 { 685 outputStream.put(cmdopt.randomValueHeader); 686 outputStream.put(cmdopt.delim); 687 } 688 outputStream.put(line); 689 outputStream.put("\n"); 690 headerWritten = true; 691 } 692 } 693 else 694 { 695 static if (!isWeighted) 696 { 697 double lineScore = uniform01(randomGenerator); 698 } 699 else 700 { 701 double lineWeight = 702 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum); 703 double lineScore = 704 (lineWeight > 0.0) 705 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 706 : 0.0; 707 } 708 709 if (reservoir.length < cmdopt.sampleSize) 710 { 711 reservoir.insert(Entry(lineScore, line.dup)); 712 } 713 else if (reservoir.front.score < lineScore) 714 { 715 reservoir.replaceFront(Entry(lineScore, line.dup)); 716 } 717 } 718 } 719 } 720 721 /* All entries are in the reservoir. Time to print. The heap is in reverse order 722 * of assigned weights. Reversing order is done by removing all elements from the 723 * heap, this leaves the backing store in the correct order for output. 724 * 725 * The asserts here avoid issues with the current binaryheap implementation. They 726 * detect use of backing stores having a length not synchronized to the reservoir. 
727 */ 728 size_t numLines = reservoir.length; 729 assert(numLines == dataStore.length); 730 731 while (!reservoir.empty) reservoir.removeFront; 732 assert(numLines == dataStore.length); 733 734 immutable randomValueFormatSpec = singleSpec("%.17g"); 735 736 foreach (entry; dataStore) 737 { 738 if (cmdopt.printRandom) 739 { 740 outputStream.formatValue(entry.score, randomValueFormatSpec); 741 outputStream.put(cmdopt.delim); 742 } 743 outputStream.put(entry.line); 744 outputStream.put("\n"); 745 } 746 } 747 748 /** Generates weighted random values for all input lines, preserving input order. 749 * 750 * This complements weighted reservoir sampling, but instead of using a reservoir it 751 * simply iterates over the input lines generating the values. The weighted random 752 * values are generated with the same formula used by reservoirSampling. 753 */ 754 void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 755 if (isOutputRange!(OutputRange, char)) 756 { 757 import std.format : formatValue, singleSpec; 758 import std.random : Random, uniform01; 759 import tsvutil : throwIfWindowsNewlineOnUnix; 760 761 assert(cmdopt.hasWeightField); 762 763 auto randomGenerator = Random(cmdopt.seed); 764 immutable randomValueFormatSpec = singleSpec("%.17g"); 765 766 /* Process each line. */ 767 bool headerWritten = false; 768 size_t numLinesWritten = 0; 769 foreach (filename; cmdopt.files) 770 { 771 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 772 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 773 { 774 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 775 if (fileLineNum == 1 && cmdopt.hasHeader) 776 { 777 if (!headerWritten) 778 { 779 outputStream.put(cmdopt.randomValueHeader); 780 outputStream.put(cmdopt.delim); 781 outputStream.put(line); 782 outputStream.put("\n"); 783 headerWritten = true; 784 } 785 } 786 else 787 { 788 double lineWeight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 789 filename, fileLineNum); 790 double lineScore = 791 (lineWeight > 0.0) 792 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 793 : 0.0; 794 795 outputStream.formatValue(lineScore, randomValueFormatSpec); 796 outputStream.put(cmdopt.delim); 797 outputStream.put(line); 798 outputStream.put("\n"); 799 800 if (cmdopt.sampleSize != 0) 801 { 802 ++numLinesWritten; 803 if (numLinesWritten == cmdopt.sampleSize) return; 804 } 805 } 806 } 807 } 808 } 809 810 /** Randomize all the lines in files or standard input. 811 * 812 * All lines in files and/or standard input are read in and written out in random 813 * order. Both simple random sampling and weighted sampling are supported. 814 * 815 * Input data size is limited by available memory. Disk oriented techniques are needed 816 * when data sizes are larger. For example, generating random values line-by-line (ala 817 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 818 * 819 * This approach is significantly faster than reading line-by-line with a heap the 820 * way reservoir sampling does, effectively acknowledging that both approaches 821 * need to read all data into memory when randomizing all lines. 822 * 823 * Note: The unweighted case could be sped up by using std.random.randomShuffle from 824 * the D standard library. This uses an O(n) swapping algorithm to perform the shuffle 825 * rather than the O(n log n) sort approach used here. 
The downsides are that the
 * result order would not be consistent with the other routines and that random number
 * printing does not make sense. Order consistency matters only in the rare case when
 * multiple randomizations are being done with the same static seed.
 *
 * Params:
 *   isWeighted = compile-time flag selecting weighted vs unweighted scoring
 *   cmdopt = processed command line options
 *   outputStream = character output range the randomized lines are written to
 */
void randomizeLines(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : min, sort, splitter;
    import std.array : appender;
    import std.format : formatValue, singleSpec;
    import std.random : Random, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    /* This routine randomizes all lines; sample-size limits are handled elsewhere. */
    assert(cmdopt.sampleSize == 0);

    /* Per-file buffer; the filename is kept for error messages. */
    struct FileData
    {
        string filename;
        char[] data;
    }

    auto fileData = new FileData[cmdopt.files.length];

    /*
     * Read all file data into memory.
     */
    ubyte[1024 * 128] fileRawBuf;
    foreach (fileNum, filename; cmdopt.files)
    {
        fileData[fileNum].filename = filename;
        auto dataAppender = appender(&(fileData[fileNum].data));
        auto ifile = (filename == "-") ? stdin : filename.File;

        if (filename != "-")
        {
            /* File.size is ulong.max when the size is unknown; otherwise use it
             * to pre-allocate and avoid repeated reallocation while reading.
             */
            ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max));
        }

        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }

    /*
     * Split the data into lines and assign a random value to each line.
     */
    struct Entry
    {
        double score;   // Random value used as the sort key.
        char[] line;    // Slice into the in-memory file data (not copied).
    }

    auto scoredLines = appender!(Entry[]);
    auto randomGenerator = Random(cmdopt.seed);
    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        auto data = (fd.data.length > 0 && fd.data[$ - 1] == '\n') ? fd.data[0 .. $ - 1] : fd.data;
        foreach (fileLineNum, line; data.splitter('\n').enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Write the header once, from the first file; skip later headers. */
                if (!headerWritten)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                static if (!isWeighted)
                {
                    double lineScore = uniform01(randomGenerator);
                }
                else
                {
                    double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, fd.filename, fileLineNum);
                    /* Weighted score: uniform01 ^^ (1/weight), the same formula
                     * used by weighted reservoir sampling. Non-positive weights
                     * score zero.
                     */
                    double lineScore =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }

                scoredLines.put(Entry(lineScore, line));
            }
        }
    }

    /*
     * Sort by the weight and output the lines. Highest score first, so the
     * output order matches the selection order of the other sampling routines.
     */
    scoredLines.data.sort!((a, b) => a.score > b.score);

    /* %.17g prints enough significant digits to round-trip a double. */
    immutable randomValueFormatSpec = singleSpec("%.17g");

    foreach (lineEntry; scoredLines.data)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(lineEntry.score, randomValueFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(lineEntry.line);
        outputStream.put("\n");
    }
}

/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Then random lines are selected
 * one at a time and output. Lines can be selected multiple times. This process continues
 * until the desired number of samples (--n|num) has been output. Output continues
 * indefinitely if a sample size was not provided.
951 */ 952 void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 953 if (isOutputRange!(OutputRange, char)) 954 { 955 import std.algorithm : each, min, sort, splitter; 956 import std.array : appender; 957 import std.format : formatValue, singleSpec; 958 import std.random : Random, uniform; 959 import tsvutil : throwIfWindowsNewlineOnUnix; 960 961 struct FileData 962 { 963 string filename; 964 char[] data; 965 } 966 967 auto fileData = new FileData[cmdopt.files.length]; 968 969 /* 970 * Read all file data into memory. 971 */ 972 ubyte[1024 * 128] fileRawBuf; 973 foreach (fileNum, filename; cmdopt.files) 974 { 975 fileData[fileNum].filename = filename; 976 auto dataAppender = appender(&(fileData[fileNum].data)); 977 auto ifile = (filename == "-") ? stdin : filename.File; 978 979 if (filename != "-") 980 { 981 ulong filesize = ifile.size; 982 if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max)); 983 } 984 985 foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer); 986 } 987 988 /* 989 * Split the data into lines. 990 */ 991 struct Entry 992 { 993 char[] line; 994 } 995 996 auto lines = appender!(Entry[]); 997 bool headerWritten = false; 998 999 foreach (fd; fileData) 1000 { 1001 /* Drop the last newline to avoid adding an extra empty line. */ 1002 auto data = (fd.data.length > 0 && fd.data[$ - 1] == '\n') ? fd.data[0 .. 
$ - 1] : fd.data; 1003 foreach (fileLineNum, line; data.splitter('\n').enumerate(1)) 1004 { 1005 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum); 1006 if (fileLineNum == 1 && cmdopt.hasHeader) 1007 { 1008 if (!headerWritten) 1009 { 1010 outputStream.put(line); 1011 outputStream.put("\n"); 1012 headerWritten = true; 1013 } 1014 } 1015 else 1016 { 1017 lines.put(Entry(line)); 1018 } 1019 } 1020 } 1021 1022 if (lines.data.length > 0) 1023 { 1024 auto randomGenerator = Random(cmdopt.seed); 1025 1026 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1027 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1028 while (numLeft != 0) 1029 { 1030 size_t index = uniform(0, lines.data.length, randomGenerator); 1031 outputStream.put(lines.data[index].line); 1032 outputStream.put("\n"); 1033 if (cmdopt.sampleSize != 0) numLeft--; 1034 } 1035 } 1036 } 1037 1038 1039 /** Convenience function for extracting a single field from a line. See getTsvFieldValue in 1040 * common/src/tsvutils.d for details. This wrapper creates error text tailored for this program. 1041 */ 1042 import std.traits : isSomeChar; 1043 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe 1044 if (isSomeChar!C) 1045 { 1046 import std.conv : ConvException, to; 1047 import std.format : format; 1048 import tsvutil : getTsvFieldValue; 1049 1050 T val; 1051 try 1052 { 1053 val = getTsvFieldValue!T(line, fieldIndex, delim); 1054 } 1055 catch (ConvException exc) 1056 { 1057 throw new Exception( 1058 format("Could not process line: %s\n File: %s Line: %s%s", 1059 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 1060 (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); 1061 } 1062 catch (Exception exc) 1063 { 1064 /* Not enough fields on the line. 
*/ 1065 throw new Exception( 1066 format("Could not process line: %s\n File: %s Line: %s", 1067 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 1068 } 1069 1070 return val; 1071 } 1072 1073 unittest 1074 { 1075 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 1076 * These tests make basic sanity checks on the getFieldValue wrapper. 1077 */ 1078 import std.exception; 1079 1080 assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 1081 assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 1082 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 1083 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 1084 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 1085 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); 1086 } 1087 1088 /* Unit tests for the main program start here. 1089 * 1090 * Portability note: Many of the tests here rely on generating consistent random numbers 1091 * across different platforms when using the same random seed. So far this has succeeded 1092 * on several different platorm, compiler, and library versions. However, it is certainly 1093 * possible this condition will not hold on other platforms. 1094 * 1095 * For tsv-sample, this portability implies generating the same results on different 1096 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, 1097 * but it is convenient for testing. If platforms are identified that do not generate 1098 * the same results these tests will need to be adjusted. 1099 */ 1100 version(unittest) 1101 { 1102 /* Unit test helper functions. */ 1103 1104 import unittest_utils; // tsv unit test helpers, from common/src/. 
1105 import std.conv : to; 1106 1107 void testTsvSample(string[] cmdArgs, string[][] expected) 1108 { 1109 import std.array : appender; 1110 import std.format : format; 1111 1112 assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); 1113 1114 auto formatAssertMessage(T...)(string msg, T formatArgs) 1115 { 1116 auto formatString = "[testTsvSample] %s: " ~ msg; 1117 return format(formatString, cmdArgs[0], formatArgs); 1118 } 1119 1120 TsvSampleOptions cmdopt; 1121 auto savedCmdArgs = cmdArgs.to!string; 1122 auto r = cmdopt.processArgs(cmdArgs); 1123 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 1124 auto output = appender!(char[])(); 1125 1126 tsvSample(cmdopt, output); // This invokes the main code line. 1127 1128 auto expectedOutput = expected.tsvDataToString; 1129 1130 assert(output.data == expectedOutput, 1131 formatAssertMessage( 1132 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 1133 expectedOutput.to!string, output.data.to!string)); 1134 } 1135 } 1136 1137 unittest 1138 { 1139 import std.path : buildPath; 1140 import std.file : rmdirRecurse; 1141 import std.format : format; 1142 1143 auto testDir = makeUnittestTempDir("tsv_sample"); 1144 scope(exit) testDir.rmdirRecurse; 1145 1146 /* Tabular data sets and expected results use the built-in static seed. 1147 * Tests are run by writing the data set to a file, then calling the main 1148 * routine to process. The function testTsvSample plays the role of the 1149 * main program. Rather than writing to expected output, the results are 1150 * matched against expected. The expected results were verified by hand 1151 * prior to inclusion in the test. 1152 * 1153 * The initial part of this section is simply setting up data files and 1154 * expected results. 1155 */ 1156 1157 /* Empty file. 
*/ 1158 string[][] dataEmpty = []; 1159 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1160 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 1161 1162 /* 3x1, header only. */ 1163 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1164 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1165 writeUnittestTsvFile(fpath_data3x0, data3x0); 1166 1167 /* 3x1 */ 1168 string[][] data3x1 = 1169 [["field_a", "field_b", "field_c"], 1170 ["tan", "タン", "8.5"]]; 1171 1172 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1173 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1174 writeUnittestTsvFile(fpath_data3x1, data3x1); 1175 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 1176 1177 string[][] data3x1ExpectedReplace3 = 1178 [["field_a", "field_b", "field_c"], 1179 ["tan", "タン", "8.5"], 1180 ["tan", "タン", "8.5"], 1181 ["tan", "タン", "8.5"]]; 1182 1183 /* 3x2 */ 1184 string[][] data3x2 = 1185 [["field_a", "field_b", "field_c"], 1186 ["brown", "褐色", "29.2"], 1187 ["gray", "グレー", "6.2"]]; 1188 1189 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1190 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1191 writeUnittestTsvFile(fpath_data3x2, data3x2); 1192 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 1193 1194 string[][] data3x2ExpectedNoWt = 1195 [["field_a", "field_b", "field_c"], 1196 ["gray", "グレー", "6.2"], 1197 ["brown", "褐色", "29.2"]]; 1198 1199 /* 3x3 */ 1200 string[][] data3x3 = 1201 [["field_a", "field_b", "field_c"], 1202 ["orange", "オレンジ", "2.5"], 1203 ["pink", "ピンク", "1.1"], 1204 ["purple", "紫の", "42"]]; 1205 1206 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1207 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1208 writeUnittestTsvFile(fpath_data3x3, data3x3); 1209 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 1210 1211 string[][] data3x3ExpectedNoWt = 1212 [["field_a", "field_b", 
"field_c"], 1213 ["purple", "紫の", "42"], 1214 ["pink", "ピンク", "1.1"], 1215 ["orange", "オレンジ", "2.5"]]; 1216 1217 /* 3x6 */ 1218 string[][] data3x6 = 1219 [["field_a", "field_b", "field_c"], 1220 ["red", "赤", "23.8"], 1221 ["green", "緑", "0.0072"], 1222 ["white", "白", "1.65"], 1223 ["yellow", "黄", "12"], 1224 ["blue", "青", "12"], 1225 ["black", "黒", "0.983"]]; 1226 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1227 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1228 writeUnittestTsvFile(fpath_data3x6, data3x6); 1229 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 1230 1231 string[][] data3x6ExpectedNoWt = 1232 [["field_a", "field_b", "field_c"], 1233 ["yellow", "黄", "12"], 1234 ["black", "黒", "0.983"], 1235 ["blue", "青", "12"], 1236 ["white", "白", "1.65"], 1237 ["green", "緑", "0.0072"], 1238 ["red", "赤", "23.8"]]; 1239 1240 string[][] data3x6ExpectedNoWtProbs = 1241 [["random_value", "field_a", "field_b", "field_c"], 1242 ["0.96055546286515892", "yellow", "黄", "12"], 1243 ["0.7571015392895788", "black", "黒", "0.983"], 1244 ["0.52525980887003243", "blue", "青", "12"], 1245 ["0.49287854949943721", "white", "白", "1.65"], 1246 ["0.15929344086907804", "green", "緑", "0.0072"], 1247 ["0.010968807619065046", "red", "赤", "23.8"]]; 1248 1249 string[][] data3x6ExpectedProbsBernoulliSampleP100 = 1250 [["random_value", "field_a", "field_b", "field_c"], 1251 ["0.010968807619065046", "red", "赤", "23.8"], 1252 ["0.15929344086907804", "green", "緑", "0.0072"], 1253 ["0.49287854949943721", "white", "白", "1.65"], 1254 ["0.96055546286515892", "yellow", "黄", "12"], 1255 ["0.52525980887003243", "blue", "青", "12"], 1256 ["0.7571015392895788", "black", "黒", "0.983"]]; 1257 1258 string[][] data3x6ExpectedProbsBernoulliSampleP60 = 1259 [["random_value", "field_a", "field_b", "field_c"], 1260 ["0.010968807619065046", "red", "赤", "23.8"], 1261 ["0.15929344086907804", "green", "緑", "0.0072"], 1262 ["0.49287854949943721", "white", "白", "1.65"], 
1263 ["0.52525980887003243", "blue", "青", "12"]]; 1264 1265 string[][] data3x6ExpectedBernoulliSampleP60 = 1266 [["field_a", "field_b", "field_c"], 1267 ["red", "赤", "23.8"], 1268 ["green", "緑", "0.0072"], 1269 ["white", "白", "1.65"], 1270 ["blue", "青", "12"]]; 1271 1272 string[][] data3x6ExpectedDistinctSampleK1K3P60 = 1273 [["field_a", "field_b", "field_c"], 1274 ["green", "緑", "0.0072"], 1275 ["white", "白", "1.65"], 1276 ["blue", "青", "12"]]; 1277 1278 string[][] data3x6ExpectedDistinctSampleK1K3P60Probs = 1279 [["random_value", "field_a", "field_b", "field_c"], 1280 ["0", "green", "緑", "0.0072"], 1281 ["0", "white", "白", "1.65"], 1282 ["0", "blue", "青", "12"]]; 1283 1284 string[][] data3x6ExpectedDistinctSampleK1K3P60ProbsRVCustom = 1285 [["custom_random_value_header", "field_a", "field_b", "field_c"], 1286 ["0", "green", "緑", "0.0072"], 1287 ["0", "white", "白", "1.65"], 1288 ["0", "blue", "青", "12"]]; 1289 1290 string[][] data3x6ExpectedDistinctSampleK2P2ProbsInorder = 1291 [["random_value", "field_a", "field_b", "field_c"], 1292 ["1", "red", "赤", "23.8"], 1293 ["0", "green", "緑", "0.0072"], 1294 ["0", "white", "白", "1.65"], 1295 ["1", "yellow", "黄", "12"], 1296 ["3", "blue", "青", "12"], 1297 ["2", "black", "黒", "0.983"]]; 1298 1299 string[][] data3x6ExpectedWt3Probs = 1300 [["random_value", "field_a", "field_b", "field_c"], 1301 ["0.9966519875764539", "yellow", "黄", "12"], 1302 ["0.94775884809836686", "blue", "青", "12"], 1303 ["0.82728234682286661", "red", "赤", "23.8"], 1304 ["0.75346697377181959", "black", "黒", "0.983"], 1305 ["0.65130103496422487", "white", "白", "1.65"], 1306 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 1307 1308 string[][] data3x6ExpectedWt3ProbsInorder = 1309 [["random_value", "field_a", "field_b", "field_c"], 1310 ["0.82728234682286661", "red", "赤", "23.8"], 1311 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 1312 ["0.65130103496422487", "white", "白", "1.65"], 1313 ["0.9966519875764539", "yellow", "黄", "12"], 1314 
["0.94775884809836686", "blue", "青", "12"], 1315 ["0.75346697377181959", "black", "黒", "0.983"]]; 1316 1317 string[][] data3x6ExpectedWt3 = 1318 [["field_a", "field_b", "field_c"], 1319 ["yellow", "黄", "12"], 1320 ["blue", "青", "12"], 1321 ["red", "赤", "23.8"], 1322 ["black", "黒", "0.983"], 1323 ["white", "白", "1.65"], 1324 ["green", "緑", "0.0072"]]; 1325 1326 string[][] data3x6ExpectedReplace10 = 1327 [["field_a", "field_b", "field_c"], 1328 ["black", "黒", "0.983"], 1329 ["green", "緑", "0.0072"], 1330 ["green", "緑", "0.0072"], 1331 ["red", "赤", "23.8"], 1332 ["yellow", "黄", "12"], 1333 ["red", "赤", "23.8"], 1334 ["white", "白", "1.65"], 1335 ["yellow", "黄", "12"], 1336 ["yellow", "黄", "12"], 1337 ["white", "白", "1.65"], 1338 ]; 1339 1340 string[][] data3x6ExpectedReplace10V77 = 1341 [["field_a", "field_b", "field_c"], 1342 ["black", "黒", "0.983"], 1343 ["red", "赤", "23.8"], 1344 ["black", "黒", "0.983"], 1345 ["yellow", "黄", "12"], 1346 ["green", "緑", "0.0072"], 1347 ["green", "緑", "0.0072"], 1348 ["green", "緑", "0.0072"], 1349 ["yellow", "黄", "12"], 1350 ["blue", "青", "12"], 1351 ["white", "白", "1.65"], 1352 ]; 1353 1354 /* Using a different static seed. 
*/ 1355 string[][] data3x6ExpectedNoWtV41Probs = 1356 [["random_value", "field_a", "field_b", "field_c"], 1357 ["0.68057272653095424", "green", "緑", "0.0072"], 1358 ["0.67681624367833138", "blue", "青", "12"], 1359 ["0.32097338931635022", "yellow", "黄", "12"], 1360 ["0.25092361867427826", "red", "赤", "23.8"], 1361 ["0.15535934292711318", "black", "黒", "0.983"], 1362 ["0.04609582107514143", "white", "白", "1.65"]]; 1363 1364 string[][] data3x6ExpectedV41ProbsBernoulliSampleP60 = 1365 [["random_value", "field_a", "field_b", "field_c"], 1366 ["0.25092361867427826", "red", "赤", "23.8"], 1367 ["0.04609582107514143", "white", "白", "1.65"], 1368 ["0.32097338931635022", "yellow", "黄", "12"], 1369 ["0.15535934292711318", "black", "黒", "0.983"]]; 1370 1371 string[][] data3x6ExpectedWt3V41Probs = 1372 [["random_value", "field_a", "field_b", "field_c"], 1373 ["0.96799377498910666", "blue", "青", "12"], 1374 ["0.94356245792573568", "red", "赤", "23.8"], 1375 ["0.90964601024271996", "yellow", "黄", "12"], 1376 ["0.15491658409260103", "white", "白", "1.65"], 1377 ["0.15043620392537033", "black", "黒", "0.983"], 1378 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 1379 1380 string[][] data3x6ExpectedWt3V41ProbsInorder = 1381 [["random_value", "field_a", "field_b", "field_c"], 1382 ["0.94356245792573568", "red", "赤", "23.8"], 1383 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 1384 ["0.15491658409260103", "white", "白", "1.65"], 1385 ["0.90964601024271996", "yellow", "黄", "12"], 1386 ["0.96799377498910666", "blue", "青", "12"], 1387 ["0.15043620392537033", "black", "黒", "0.983"]]; 1388 1389 1390 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1391 string[][] combo1ExpectedNoWt = 1392 [["field_a", "field_b", "field_c"], 1393 ["yellow", "黄", "12"], 1394 ["tan", "タン", "8.5"], 1395 ["brown", "褐色", "29.2"], 1396 ["green", "緑", "0.0072"], 1397 ["red", "赤", "23.8"], 1398 ["purple", "紫の", "42"], 1399 ["black", "黒", "0.983"], 1400 ["white", "白", "1.65"], 1401 ["gray", "グレー", "6.2"], 1402 ["blue", "青", "12"], 1403 ["pink", "ピンク", "1.1"], 1404 ["orange", "オレンジ", "2.5"]]; 1405 1406 string[][] combo1ExpectedNoWtProbs = 1407 [["random_value", "field_a", "field_b", "field_c"], 1408 ["0.97088520275428891", "yellow", "黄", "12"], 1409 ["0.96055546286515892", "tan", "タン", "8.5"], 1410 ["0.81756894313730299", "brown", "褐色", "29.2"], 1411 ["0.7571015392895788", "green", "緑", "0.0072"], 1412 ["0.52525980887003243", "red", "赤", "23.8"], 1413 ["0.49287854949943721", "purple", "紫の", "42"], 1414 ["0.47081507067196071", "black", "黒", "0.983"], 1415 ["0.38388182921335101", "white", "白", "1.65"], 1416 ["0.29215990612283349", "gray", "グレー", "6.2"], 1417 ["0.24033216014504433", "blue", "青", "12"], 1418 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1419 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 1420 1421 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1422 string[][] combo1ExpectedNoWtProbsInorder = 1423 [["random_value", "field_a", "field_b", "field_c"], 1424 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1425 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1426 ["0.49287854949943721", "purple", "紫の", "42"], 1427 ["0.96055546286515892", "tan", "タン", "8.5"], 1428 ["0.52525980887003243", "red", "赤", "23.8"], 1429 ["0.7571015392895788", "green", "緑", "0.0072"], 1430 ["0.38388182921335101", "white", "白", "1.65"], 1431 ["0.97088520275428891", "yellow", "黄", "12"], 1432 ["0.24033216014504433", "blue", "青", "12"], 1433 ["0.47081507067196071", "black", "黒", "0.983"], 1434 ["0.81756894313730299", "brown", "褐色", "29.2"], 1435 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1436 1437 string[][] combo1ExpectedProbsBernoulliSampleP50 = 1438 [["random_value", "field_a", "field_b", "field_c"], 1439 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1440 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1441 ["0.49287854949943721", "purple", "紫の", "42"], 1442 ["0.38388182921335101", "white", "白", "1.65"], 1443 ["0.24033216014504433", "blue", "青", "12"], 1444 ["0.47081507067196071", "black", "黒", "0.983"], 1445 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1446 1447 string[][] combo1ExpectedBernoulliSampleP40 = 1448 [["field_a", "field_b", "field_c"], 1449 ["orange", "オレンジ", "2.5"], 1450 ["pink", "ピンク", "1.1"], 1451 ["white", "白", "1.65"], 1452 ["blue", "青", "12"], 1453 ["gray", "グレー", "6.2"]]; 1454 1455 string[][] combo1ExpectedDistinctSampleK1P40 = 1456 [["field_a", "field_b", "field_c"], 1457 ["orange", "オレンジ", "2.5"], 1458 ["red", "赤", "23.8"], 1459 ["green", "緑", "0.0072"], 1460 ["blue", "青", "12"], 1461 ["black", "黒", "0.983"]]; 1462 1463 string[][] combo1ExpectedWt3Probs = 1464 [["random_value", "field_a", "field_b", "field_c"], 1465 ["0.99754077523718754", "yellow", "黄", "12"], 1466 ["0.99527665440088786", "tan", "タン", "8.5"], 1467 ["0.99312578945741659", "brown", "褐色", "29.2"], 1468 ["0.98329602553389361", 
"purple", "紫の", "42"], 1469 ["0.9733096193808366", "red", "赤", "23.8"], 1470 ["0.88797551521739648", "blue", "青", "12"], 1471 ["0.81999230489041786", "gray", "グレー", "6.2"], 1472 ["0.55975569204250941", "white", "白", "1.65"], 1473 ["0.46472135609205739", "black", "黒", "0.983"], 1474 ["0.18824582704191337", "pink", "ピンク", "1.1"], 1475 ["0.1644613185329992", "orange", "オレンジ", "2.5"], 1476 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 1477 1478 string[][] combo1ExpectedWt3 = 1479 [["field_a", "field_b", "field_c"], 1480 ["yellow", "黄", "12"], 1481 ["tan", "タン", "8.5"], 1482 ["brown", "褐色", "29.2"], 1483 ["purple", "紫の", "42"], 1484 ["red", "赤", "23.8"], 1485 ["blue", "青", "12"], 1486 ["gray", "グレー", "6.2"], 1487 ["white", "白", "1.65"], 1488 ["black", "黒", "0.983"], 1489 ["pink", "ピンク", "1.1"], 1490 ["orange", "オレンジ", "2.5"], 1491 ["green", "緑", "0.0072"]]; 1492 1493 string[][] combo1ExpectedReplace10 = 1494 [["field_a", "field_b", "field_c"], 1495 ["gray", "グレー", "6.2"], 1496 ["yellow", "黄", "12"], 1497 ["yellow", "黄", "12"], 1498 ["white", "白", "1.65"], 1499 ["tan", "タン", "8.5"], 1500 ["white", "白", "1.65"], 1501 ["blue", "青", "12"], 1502 ["black", "黒", "0.983"], 1503 ["tan", "タン", "8.5"], 1504 ["purple", "紫の", "42"]]; 1505 1506 /* 1x10 - Simple 1-column file. */ 1507 string[][] data1x10 = 1508 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 1509 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 1510 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 1511 writeUnittestTsvFile(fpath_data1x10, data1x10); 1512 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 1513 1514 string[][] data1x10ExpectedNoWt = 1515 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 1516 1517 string[][] data1x10ExpectedWt1 = 1518 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 1519 1520 /* 2x10a - Uniform distribution [0,1]. 
*/ 1521 string[][] data2x10a = 1522 [["line", "weight"], 1523 ["1", "0.26788837"], 1524 ["2", "0.06601298"], 1525 ["3", "0.38627527"], 1526 ["4", "0.47379424"], 1527 ["5", "0.02966641"], 1528 ["6", "0.05636231"], 1529 ["7", "0.70529242"], 1530 ["8", "0.91836862"], 1531 ["9", "0.99103720"], 1532 ["10", "0.31401740"]]; 1533 1534 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 1535 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 1536 1537 string[][] data2x10aExpectedWt2Probs = 1538 [["random_value", "line", "weight"], 1539 ["0.96833865494543658", "8", "0.91836862"], 1540 ["0.91856842054413923", "4", "0.47379424"], 1541 ["0.25730832087795091", "7", "0.70529242"], 1542 ["0.2372531790701812", "9", "0.99103720"], 1543 ["0.16016096701872204", "3", "0.38627527"], 1544 ["0.090819662667243381", "10", "0.31401740"], 1545 ["0.0071764539244361172", "6", "0.05636231"], 1546 ["4.8318642951630057e-08", "1", "0.26788837"], 1547 ["3.7525692966535517e-10", "5", "0.02966641"], 1548 ["8.2123247880095796e-13", "2", "0.06601298"]]; 1549 1550 /* 2x10b - Uniform distribution [0,1000]. 
*/ 1551 string[][] data2x10b = 1552 [["line", "weight"], 1553 ["1", "761"], 1554 ["2", "432"], 1555 ["3", "103"], 1556 ["4", "448"], 1557 ["5", "750"], 1558 ["6", "711"], 1559 ["7", "867"], 1560 ["8", "841"], 1561 ["9", "963"], 1562 ["10", "784"]]; 1563 1564 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 1565 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 1566 1567 string[][] data2x10bExpectedWt2Probs = 1568 [["random_value", "line", "weight"], 1569 ["0.99996486739067969", "8", "841"], 1570 ["0.99991017467137211", "4", "448"], 1571 ["0.99960871524873662", "6", "711"], 1572 ["0.999141885371438", "5", "750"], 1573 ["0.99903963250274785", "10", "784"], 1574 ["0.99889631825931946", "7", "867"], 1575 ["0.99852058315191139", "9", "963"], 1576 ["0.99575669679158918", "2", "432"], 1577 ["0.99408758732050595", "1", "761"], 1578 ["0.99315467761212362", "3", "103"]]; 1579 1580 /* 2x10c - Logarithmic distribution in random order. */ 1581 string[][] data2x10c = 1582 [["line", "weight"], 1583 ["1", "31.85"], 1584 ["2", "17403.31"], 1585 ["3", "653.84"], 1586 ["4", "8.23"], 1587 ["5", "2671.04"], 1588 ["6", "26226.08"], 1589 ["7", "1.79"], 1590 ["8", "354.56"], 1591 ["9", "35213.81"], 1592 ["10", "679.29"]]; 1593 1594 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 1595 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 1596 1597 string[][] data2x10cExpectedWt2Probs = 1598 [["random_value", "line", "weight"], 1599 ["0.99998939008709697", "6", "26226.08"], 1600 ["0.99995951291695517", "9", "35213.81"], 1601 ["0.99991666907613541", "8", "354.56"], 1602 ["0.9998944505218641", "2", "17403.31"], 1603 ["0.9997589760286163", "5", "2671.04"], 1604 ["0.99891852769877643", "3", "653.84"], 1605 ["0.99889167752782515", "10", "679.29"], 1606 ["0.99512207506850148", "4", "8.23"], 1607 ["0.86789371584259023", "1", "31.85"], 1608 ["0.5857443816291561", "7", "1.79"]]; 1609 1610 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 1611 string[][] data2x10d = 1612 [["line", "weight"], 1613 ["1", "1.79"], 1614 ["2", "8.23"], 1615 ["3", "31.85"], 1616 ["4", "354.56"], 1617 ["5", "653.84"], 1618 ["6", "679.29"], 1619 ["7", "2671.04"], 1620 ["8", "17403.31"], 1621 ["9", "26226.08"], 1622 ["10", "35213.81"]]; 1623 1624 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 1625 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 1626 1627 string[][] data2x10dExpectedWt2Probs = 1628 [["random_value", "line", "weight"], 1629 ["0.99999830221846353", "8", "17403.31"], 1630 ["0.99997860834041397", "10", "35213.81"], 1631 ["0.99994563828986716", "9", "26226.08"], 1632 ["0.99988650363575737", "4", "354.56"], 1633 ["0.99964161939190088", "7", "2671.04"], 1634 ["0.99959045338948649", "6", "679.29"], 1635 ["0.99901574490639788", "5", "653.84"], 1636 ["0.97803163304747431", "3", "31.85"], 1637 ["0.79994791806910948", "2", "8.23"], 1638 ["0.080374261239949119", "1", "1.79"]]; 1639 1640 /* 2x10e. Logarithmic distribution in descending order. 
*/
    // Weighted-sampling data set with an exponentially decaying weight column.
    string[][] data2x10e =
        [["line", "weight"],
         ["1", "35213.81"],
         ["2", "26226.08"],
         ["3", "17403.31"],
         ["4", "2671.04"],
         ["5", "679.29"],
         ["6", "653.84"],
         ["7", "354.56"],
         ["8", "31.85"],
         ["9", "8.23"],
         ["10", "1.79"]];
    string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv");
    writeUnittestTsvFile(fpath_data2x10e, data2x10e);

    // Expected output for weighted sampling on field 2 with --print-random (static seed).
    string[][] data2x10eExpectedWt2Probs =
        [["random_value", "line", "weight"],
         ["0.99998493348975237", "4", "2671.04"],
         ["0.99995934807202624", "3", "17403.31"],
         ["0.99992995739727453", "2", "26226.08"],
         ["0.99987185679245649", "1", "35213.81"],
         ["0.99957451563173938", "6", "653.84"],
         ["0.99907273650209583", "8", "31.85"],
         ["0.99905260312968946", "5", "679.29"],
         ["0.99730333650516401", "7", "354.56"],
         ["0.84093902435227808", "9", "8.23"],
         ["0.65650015926290028", "10", "1.79"]];

    /* Data sets for distinct sampling. */
    string[][] data5x25 =
        [["ID", "Shape", "Color", "Size", "Weight"],
         ["01", "circle", "red", "S", "10"],
         ["02", "circle", "black", "L", "20"],
         ["03", "square", "black", "L", "20"],
         ["04", "circle", "green", "L", "30"],
         ["05", "ellipse", "red", "S", "20"],
         ["06", "triangle", "red", "S", "10"],
         ["07", "triangle", "red", "L", "20"],
         ["08", "square", "black", "S", "10"],
         ["09", "circle", "black", "S", "20"],
         ["10", "square", "green", "L", "20"],
         ["11", "triangle", "red", "L", "20"],
         ["12", "circle", "green", "L", "30"],
         ["13", "ellipse", "red", "S", "20"],
         ["14", "circle", "green", "L", "30"],
         ["15", "ellipse", "red", "L", "30"],
         ["16", "square", "red", "S", "10"],
         ["17", "circle", "black", "L", "20"],
         ["18", "square", "red", "S", "20"],
         ["19", "square", "black", "L", "20"],
         ["20", "circle", "red", "S", "10"],
         ["21", "ellipse", "black", "L", "30"],
         ["22", "triangle", "red", "L", "30"],
         ["23", "circle", "green", "S", "20"],
         ["24", "square", "green", "L", "20"],
         ["25", "circle", "red", "S", "10"],
        ];

    string fpath_data5x25 = buildPath(testDir, "data5x25.tsv");
    string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv");
    writeUnittestTsvFile(fpath_data5x25, data5x25);
    writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]);

    // Expected: distinct sampling on key field 2 (Shape), prob 0.40, static seed.
    string[][] data5x25ExpectedDistinctSampleK2P40 =
        [["ID", "Shape", "Color", "Size", "Weight"],
         ["03", "square", "black", "L", "20"],
         ["05", "ellipse", "red", "S", "20"],
         ["08", "square", "black", "S", "10"],
         ["10", "square", "green", "L", "20"],
         ["13", "ellipse", "red", "S", "20"],
         ["15", "ellipse", "red", "L", "30"],
         ["16", "square", "red", "S", "10"],
         ["18", "square", "red", "S", "20"],
         ["19", "square", "black", "L", "20"],
         ["21", "ellipse", "black", "L", "30"],
         ["24", "square", "green", "L", "20"],
        ];

    // Expected: distinct sampling on key fields 2,4 (Shape, Size), prob 0.20, static seed.
    string[][] data5x25ExpectedDistinctSampleK2K4P20 =
        [["ID", "Shape", "Color", "Size", "Weight"],
         ["03", "square", "black", "L", "20"],
         ["07", "triangle", "red", "L", "20"],
         ["08", "square", "black", "S", "10"],
         ["10", "square", "green", "L", "20"],
         ["11", "triangle", "red", "L", "20"],
         ["16", "square", "red", "S", "10"],
         ["18", "square", "red", "S", "20"],
         ["19", "square", "black", "L", "20"],
         ["22", "triangle", "red", "L", "30"],
         ["24", "square", "green", "L", "20"],
        ];

    // Expected: distinct sampling on key fields 2-4 (Shape, Color, Size), prob 0.20, static seed.
    string[][] data5x25ExpectedDistinctSampleK2K3K4P20 =
        [["ID", "Shape", "Color", "Size", "Weight"],
         ["04", "circle", "green", "L", "30"],
         ["07", "triangle", "red", "L", "20"],
         ["09", "circle", "black", "S", "20"],
         ["11", "triangle", "red", "L", "20"],
         ["12", "circle", "green", "L", "30"],
         ["14", "circle", "green", "L", "30"],
         ["16", "square", "red", "S", "10"],
         ["18", "square", "red", "S", "20"],
         ["22", "triangle", "red", "L", "30"],
        ];

    /*
     * Enough setup! Actually run some tests!
     */

    /* Basic tests. Headers and static seed. With weights and without. */
    testTsvSample(["test-a1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-a2", "--header", "--static-seed", fpath_data3x0], data3x0);
    testTsvSample(["test-a3", "-H", "-s", fpath_data3x1], data3x1);
    testTsvSample(["test-a4", "-H", "-s", fpath_data3x2], data3x2ExpectedNoWt);
    testTsvSample(["test-a5", "-H", "-s", fpath_data3x3], data3x3ExpectedNoWt);
    testTsvSample(["test-a6", "-H", "-s", fpath_data3x6], data3x6ExpectedNoWt);
    testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs);
    testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedWt3);
    testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs);
    // --seed-value 41: explicit seed should reproduce the same V41 expected output as -v 41.
    testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
    testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedNoWtV41Probs);
    // -v 0 with -s: a zero seed value falls back to the static seed behavior.
    testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs);
    testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedWt3V41Probs);

    /* Bernoulli sampling cases.
*/
    // Empty and header-only inputs must pass through unchanged under Bernoulli sampling.
    testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0);
    // Probability 1.0 selects every line, preserving input order.
    testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1);
    testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6);
    testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6);
    testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);
    testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP60);
    testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliSampleP60);
    testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedV41ProbsBernoulliSampleP60);

    /* Distinct sampling cases. */
    testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0);
    // Probability 1.0 admits every key, so all lines are output in order.
    testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1);
    testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6);
    testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60);

    /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling.
     * For weighted sampling, use the weighted cases, but with expected using the original ordering.
*/
    testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);
    // NOTE(review): test-a29 is an exact repeat of test-a28 (same flags, same file, same
    // expected output) — possibly one was meant to vary an option; confirm intent upstream.
    testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100);
    testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
                  data3x6ExpectedWt3ProbsInorder);
    testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6],
                  data3x6ExpectedWt3V41ProbsInorder);
    testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6],
                  data3x6ExpectedDistinctSampleK1K3P60Probs);
    // --random-value-header overrides the default "random_value" column name.
    testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header",
                   "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60ProbsRVCustom);
    testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6],
                  data3x6ExpectedDistinctSampleK2P2ProbsInorder);

    /* Simple random sampling with replacement. */
    testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0);
    testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0);
    testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplace3);
    testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplace10);
    testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplace10V77);

    /* Basic tests, without headers.
*/
    // Same coverage as the test-a* cases, but on the no-header variants of the files;
    // expected arrays are sliced with [1..$] to drop the header row.
    testTsvSample(["test-b1", "-s", fpath_data3x1_noheader], data3x1[1..$]);
    testTsvSample(["test-b2", "-s", fpath_data3x2_noheader], data3x2ExpectedNoWt[1..$]);
    testTsvSample(["test-b3", "-s", fpath_data3x3_noheader], data3x3ExpectedNoWt[1..$]);
    testTsvSample(["test-b4", "-s", fpath_data3x6_noheader], data3x6ExpectedNoWt[1..$]);
    testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..$]);
    testTsvSample(["test-b6", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..$]);
    testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..$]);
    testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtV41Probs[1..$]);
    testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedWt3V41Probs[1..$]);

    /* Bernoulli sampling cases. */
    testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]);
    testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
    testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]);
    testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..$]);
    testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP60[1..$]);
    testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedV41ProbsBernoulliSampleP60[1..$]);

    /* Distinct sampling cases.
*/
    // Distinct sampling on no-header files; prob 1.0 admits every key.
    testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]);
    testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
    testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);
    testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]);

    /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
    testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..$]);
    testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]);
    testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
                  data3x6ExpectedDistinctSampleK1K3P60Probs[1..$]);
    // Fix: this test previously reused the ID "test-b24", making a failure report
    // ambiguous between the two cases. Renamed to "test-b24b" (b25-b29 are taken below).
    testTsvSample(["test-b24b", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
                  data3x6ExpectedDistinctSampleK2P2ProbsInorder[1..$]);

    /* Simple random sampling with replacement. */
    testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
    testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplace3[1..$]);
    testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplace10[1..$]);
    testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplace10V77[1..$]);

    /* Multi-file tests.
*/
    // Multiple input files, including empty and header-only files mixed in; output
    // combines all inputs (combo1* expected arrays).
    testTsvSample(["test-c1", "--header", "--static-seed",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedNoWt);
    testTsvSample(["test-c2", "--header", "--static-seed", "--print-random",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedNoWtProbs);
    testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedWt3Probs);
    testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedWt3);

    /* Multi-file, no headers. */
    testTsvSample(["test-c5", "--static-seed",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedNoWt[1..$]);
    testTsvSample(["test-c6", "--static-seed", "--print-random",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedNoWtProbs[1..$]);
    testTsvSample(["test-c7", "--static-seed", "--print-random", "--weight-field", "3",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedWt3Probs[1..$]);
    testTsvSample(["test-c8", "--static-seed", "--weight-field", "3",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedWt3[1..$]);

    /* Bernoulli sampling cases.
*/
    // Multi-file Bernoulli sampling, with and without headers.
    testTsvSample(["test-c9", "--header", "--static-seed", "--print-random", "--prob", ".5",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedProbsBernoulliSampleP50);
    testTsvSample(["test-c10", "--header", "--static-seed", "--prob", ".4",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedBernoulliSampleP40);
    testTsvSample(["test-c11", "--static-seed", "--print-random", "--prob", ".5",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedProbsBernoulliSampleP50[1..$]);
    testTsvSample(["test-c12", "--static-seed", "--prob", ".4",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedBernoulliSampleP40[1..$]);

    /* Distinct sampling cases. */
    testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedDistinctSampleK1P40);
    testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedDistinctSampleK1P40[1..$]);

    /* Generating random weights.
*/
    testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedNoWtProbsInorder);
    testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
                   fpath_data3x3_noheader, fpath_data3x1_noheader,
                   fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedNoWtProbsInorder[1..$]);

    /* Simple random sampling with replacement. */
    testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10",
                   fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
                  combo1ExpectedReplace10);

    testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10",
                   fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
                   fpath_data3x6_noheader, fpath_data3x2_noheader],
                  combo1ExpectedReplace10[1..$]);

    /* Single column file. The second run repeats the first; with a static seed the
     * output must be identical.
     * Fix: both runs previously shared the ID "test-d1"; renamed the repeat so each
     * test reports under a unique name. */
    testTsvSample(["test-d1", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt);
    testTsvSample(["test-d2", "-H", "-s", fpath_data1x10], data1x10ExpectedNoWt);

    /* Distributions.
     * Fix: all five runs previously shared the ID "test-e1"; renamed e1-e5 so a
     * failure identifies which data set (a-e) broke. */
    testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedWt2Probs);
    testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedWt2Probs);
    testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedWt2Probs);
    testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedWt2Probs);
    testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedWt2Probs);

    /* Tests of subset sample (--n|num) field.
     *
     * Note: The way these tests are done ensures that subset length does not affect
     * output order.
*/
    import std.algorithm : min;
    // Sweep the requested sample size n from above the data length down to 1. With a
    // static seed, a smaller n must yield a prefix of the larger-n output.
    for (size_t n = data3x6.length + 2; n >= 1; n--)
    {
        size_t expectedLength = min(data3x6.length, n + 1);
        testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedNoWt[0..expectedLength]);

        testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string,
                       "-H", "--print-random", fpath_data3x6], data3x6ExpectedNoWtProbs[0..expectedLength]);

        testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string,
                       "-H", "-w", "3", fpath_data3x6], data3x6ExpectedWt3[0..expectedLength]);

        testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string,
                       "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedWt3Probs[0..expectedLength]);

        testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedNoWt[1..expectedLength]);

        testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string,
                       "--print-random", fpath_data3x6_noheader], data3x6ExpectedNoWtProbs[1..expectedLength]);

        testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string,
                       "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3[1..expectedLength]);

        testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string,
                       "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedWt3Probs[1..expectedLength]);

        /* Bernoulli sampling can produce fewer lines than n, so cap against the
         * sampled expected set. (Fix: removed a redundant duplicate
         * 'import std.algorithm : min;' that appeared here — min is already imported
         * at the top of this section.) */
        size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedProbsBernoulliSampleP60.length);

        testTsvSample([format("test-f9_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "-H", "--print-random", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP60[0..sampleExpectedLength]);

        testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedBernoulliSampleP60[0..sampleExpectedLength]);

        testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       "--print-random", fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP60[1..sampleExpectedLength]);

        testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedBernoulliSampleP60[1..sampleExpectedLength]);

        size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctSampleK1K3P60.length);

        testTsvSample([format("test-f13_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedDistinctSampleK1K3P60[0..distinctExpectedLength]);

        testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedDistinctSampleK1K3P60[1..distinctExpectedLength]);

        testTsvSample([format("test-f15_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
                       "-H", fpath_data3x6], data3x6ExpectedProbsBernoulliSampleP100[0..expectedLength]);

        // Fix: this case previously reused the ID "test-f15"; renamed to "test-f16" so
        // the header and no-header variants report under distinct names.
        testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
                       fpath_data3x6_noheader], data3x6ExpectedProbsBernoulliSampleP100[1..expectedLength]);
    }

    /* Similar tests with the 1x10 data set.
*/
    // Same sample-size sweep as the f-tests, on the single-column 1x10 data set.
    for (size_t n = data1x10.length + 2; n >= 1; n--)
    {
        size_t expectedLength = min(data1x10.length, n + 1);
        testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string,
                       "-H", fpath_data1x10], data1x10ExpectedNoWt[0..expectedLength]);

        testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string,
                       "-H", "-w", "1", fpath_data1x10], data1x10ExpectedWt1[0..expectedLength]);

        testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string,
                       fpath_data1x10_noheader], data1x10ExpectedNoWt[1..expectedLength]);

        testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string,
                       "-w", "1", fpath_data1x10_noheader], data1x10ExpectedWt1[1..expectedLength]);
    }

    /* Simple random sampling with replacement: ensure sample size doesn't change order. */
    // n counts down from length-1 to 1; expected output is a prefix of the full
    // replacement-sample expected array.
    for (size_t n = data3x6ExpectedReplace10.length - 1; n >= 1; n--)
    {
        testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6],
                      data3x6ExpectedReplace10[0 .. n + 1]);

        testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader],
                      data3x6ExpectedReplace10[1 .. n + 1]);
    }

    /* Distinct sampling tests.
*/
    // Distinct sampling on the 5x25 data set: single key, key pair, and key range.
    testTsvSample(["test-i1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25],
                  data5x25ExpectedDistinctSampleK2P40);

    testTsvSample(["test-i2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25],
                  data5x25ExpectedDistinctSampleK2K4P20);

    // "2-4" exercises the field-range form of --key-fields.
    testTsvSample(["test-i3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25],
                  data5x25ExpectedDistinctSampleK2K3K4P20);

    // No-header variants of the same three cases.
    testTsvSample(["test-i4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctSampleK2P40[1..$]);

    testTsvSample(["test-i5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctSampleK2K4P20[1..$]);

    testTsvSample(["test-i6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader],
                  data5x25ExpectedDistinctSampleK2K3K4P20[1..$]);
}