/**
Command line tool for randomizing or sampling lines from input streams. Several
sampling methods are available, including simple random sampling, weighted random
sampling, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Processes the command line arguments, then runs the selected sampling routine
     * writing to a buffered wrapper around stdout. Returns the exit status given by
     * argument processing when it asks for an early exit (help, version, or an
     * argument error), 1 if sampling throws an Exception, and 0 otherwise.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];   // r[0]: continue processing?  r[1]: exit status.
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            import tsvutil : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Short help text, printed for the getopt help option. The option list produced by
 * getopt is printed after it.
 */
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";

/* Detailed help text, printed for '--help-verbose'. The option list produced by
 * getopt is printed after it.
 */
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and are given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";

/** Container for command line options.
182 */ 183 struct TsvSampleOptions 184 { 185 string programName; 186 string[] files; 187 bool helpVerbose = false; // --help-verbose 188 bool hasHeader = false; // --H|header 189 size_t sampleSize = 0; // --n|num - Size of the desired sample 190 double inclusionProbability = double.nan; // --p|prob - Inclusion probability 191 size_t[] keyFields; // --k|key-fields - Used with inclusion probability 192 size_t weightField = 0; // --w|weight-field - Field holding the weight 193 bool srsWithReplacement = false; // --r|replace 194 bool staticSeed = false; // --s|static-seed 195 uint seedValueOptionArg = 0; // --v|seed-value 196 bool printRandom = false; // --print-random 197 bool genRandomInorder = false; // --gen-random-inorder 198 string randomValueHeader = "random_value"; // --random-value-header 199 bool compatibilityMode = false; // --compatibility-mode 200 char delim = '\t'; // --d|delimiter 201 bool versionWanted = false; // --V|version 202 bool preferSkipSampling = false; // --prefer-skip-sampling 203 bool preferAlgorithmR = false; // --prefer-algorithm-r 204 bool hasWeightField = false; // Derived. 205 bool useBernoulliSampling = false; // Derived. 206 bool useDistinctSampling = false; // Derived. 207 bool usingUnpredictableSeed = true; // Derived from --static-seed, --seed-value 208 uint seed = 0; // Derived from --static-seed, --seed-value 209 210 auto processArgs(ref string[] cmdArgs) 211 { 212 import std.algorithm : canFind; 213 import std.getopt; 214 import std.math : isNaN; 215 import std.path : baseName, stripExtension; 216 import std.typecons : Yes, No; 217 import tsvutil : makeFieldListOptionHandler; 218 219 programName = (cmdArgs.length > 0) ? 
cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 220 221 try 222 { 223 arraySep = ","; // Use comma to separate values in command line options 224 auto r = getopt( 225 cmdArgs, 226 "help-verbose", " Print more detailed help.", &helpVerbose, 227 228 std.getopt.config.caseSensitive, 229 "H|header", " Treat the first line of each file as a header.", &hasHeader, 230 std.getopt.config.caseInsensitive, 231 232 "n|num", "NUM Maximim number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, 233 "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, 234 235 "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with --p|prob.", 236 keyFields.makeFieldListOptionHandler!(size_t, Yes.convertToZeroBasedIndex), 237 238 "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField, 239 "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, 240 "s|static-seed", " Use the same random seed every run.", &staticSeed, 241 242 std.getopt.config.caseSensitive, 243 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 244 std.getopt.config.caseInsensitive, 245 246 "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, 247 "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, 248 "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader, 249 "compatibility-mode", " Turns on 'compatibility-mode'. 
Use --help-verbose for information.", &compatibilityMode, 250 251 "d|delimiter", "CHR Field delimiter.", &delim, 252 253 std.getopt.config.caseSensitive, 254 "V|version", " Print version information and exit.", &versionWanted, 255 std.getopt.config.caseInsensitive, 256 257 "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", 258 &preferSkipSampling, 259 260 "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.", 261 &preferAlgorithmR, 262 ); 263 264 if (r.helpWanted) 265 { 266 defaultGetoptPrinter(helpText, r.options); 267 return tuple(false, 0); 268 } 269 else if (helpVerbose) 270 { 271 defaultGetoptPrinter(helpTextVerbose, r.options); 272 return tuple(false, 0); 273 } 274 else if (versionWanted) 275 { 276 import tsvutils_version; 277 writeln(tsvutilsVersionNotice("tsv-sample")); 278 return tuple(false, 0); 279 } 280 281 /* Derivations and validations. */ 282 if (weightField > 0) 283 { 284 hasWeightField = true; 285 weightField--; // Switch to zero-based indexes. 
286 } 287 288 if (srsWithReplacement) 289 { 290 if (hasWeightField) 291 { 292 throw new Exception("Sampling with replacement (--r|replace) does not support wieghts (--w|weight-field)."); 293 } 294 else if (!inclusionProbability.isNaN) 295 { 296 throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); 297 } 298 else if (keyFields.length > 0) 299 { 300 throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); 301 } 302 else if (printRandom || genRandomInorder) 303 { 304 throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); 305 } 306 } 307 308 if (keyFields.length > 0) 309 { 310 if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields."); 311 } 312 313 /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */ 314 if (!inclusionProbability.isNaN) 315 { 316 if (inclusionProbability <= 0.0 || inclusionProbability > 1.0) 317 { 318 import std.format : format; 319 throw new Exception( 320 format("Invalid --p|prob option: %g. 
Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); 321 } 322 323 if (keyFields.length > 0) useDistinctSampling = true; 324 else useBernoulliSampling = true; 325 326 if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together."); 327 328 if (genRandomInorder && !useDistinctSampling) 329 { 330 throw new Exception("--q|gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used."); 331 } 332 } 333 else if (genRandomInorder && !hasWeightField) 334 { 335 useBernoulliSampling = true; 336 } 337 338 if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') || 339 randomValueHeader.canFind(delim)) 340 { 341 throw new Exception("--randomValueHeader must be at least one character and not contain field delimiters or newlines."); 342 } 343 344 /* Random value printing implies compatibility-mode, otherwise user's selection is used. */ 345 if (printRandom || genRandomInorder) compatibilityMode = true; 346 347 /* Seed. */ 348 import std.random : unpredictableSeed; 349 350 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 351 352 if (usingUnpredictableSeed) seed = unpredictableSeed; 353 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 354 else if (staticSeed) seed = 2438424139; 355 else assert(0, "Internal error, invalid seed option states."); 356 357 /* Assume remaining args are files. Use standard input if files were not provided. */ 358 files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"]; 359 cmdArgs.length = 1; 360 } 361 catch (Exception exc) 362 { 363 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 364 return tuple(false, 1); 365 } 366 return tuple(true, 0); 367 } 368 } 369 /** Invokes the appropriate sampling routine based on the command line arguments. 
370 */ 371 void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 372 if (isOutputRange!(OutputRange, char)) 373 { 374 if (cmdopt.srsWithReplacement) 375 { 376 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 377 } 378 else if (cmdopt.useBernoulliSampling) 379 { 380 bernoulliSamplingCommand(cmdopt, outputStream); 381 } 382 else if (cmdopt.useDistinctSampling) 383 { 384 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 385 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 386 } 387 else if (cmdopt.genRandomInorder) 388 { 389 /* Note that the preceeding cases handle gen-random-inorder themselves (Bernoulli, 390 * Distinct), or don't handle it (SRS w/ Replacement). 391 */ 392 assert(cmdopt.hasWeightField); 393 generateWeightedRandomValuesInorder(cmdopt, outputStream); 394 } 395 else if (cmdopt.sampleSize != 0) 396 { 397 reservoirSamplingCommand(cmdopt, outputStream); 398 } 399 else 400 { 401 randomizeLinesCommand(cmdopt, outputStream); 402 } 403 } 404 405 /** Bernoulli sampling on the input stream. 406 * 407 * This routine selects the appropriate bernoulli sampling function and template 408 * instantiation to use based on the command line arguments. 409 * 410 * See the bernoulliSkipSampling routine for a discussion of the choices behind the 411 * skipSamplingProbabilityThreshold used here. 
412 */ 413 void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 414 if (isOutputRange!(OutputRange, char)) 415 { 416 assert(!cmdopt.hasWeightField); 417 418 immutable double skipSamplingProbabilityThreshold = 0.04; 419 420 if (cmdopt.compatibilityMode || 421 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 422 { 423 if (cmdopt.genRandomInorder) 424 { 425 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 426 } 427 else 428 { 429 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 430 } 431 } 432 else 433 { 434 bernoulliSkipSampling(cmdopt, outputStream); 435 } 436 } 437 438 /** Bernoulli sampling on the input stream. 439 * 440 * Each input line is a assigned a random value and output if less than 441 * cmdopt.inclusionProbability. The order of the lines is not changed. 442 * 443 * This routine supports random value printing and gen-random-inorder value printing. 444 */ 445 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 446 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 447 if (isOutputRange!(OutputRange, char)) 448 { 449 import std.format : formatValue, singleSpec; 450 import std.random : Random = Mt19937, uniform01; 451 import tsvutil : throwIfWindowsNewlineOnUnix; 452 453 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 454 else assert(!cmdopt.genRandomInorder); 455 456 auto randomGenerator = Random(cmdopt.seed); 457 immutable randomValueFormatSpec = singleSpec("%.17g"); 458 459 /* Process each line. */ 460 bool headerWritten = false; 461 size_t numLinesWritten = 0; 462 foreach (filename; cmdopt.files) 463 { 464 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 465 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 466 { 467 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 468 if (fileLineNum == 1 && cmdopt.hasHeader) 469 { 470 if (!headerWritten) 471 { 472 static if (generateRandomAll) 473 { 474 outputStream.put(cmdopt.randomValueHeader); 475 outputStream.put(cmdopt.delim); 476 } 477 else if (cmdopt.printRandom) 478 { 479 outputStream.put(cmdopt.randomValueHeader); 480 outputStream.put(cmdopt.delim); 481 } 482 483 outputStream.put(line); 484 outputStream.put("\n"); 485 headerWritten = true; 486 } 487 } 488 else 489 { 490 double lineScore = uniform01(randomGenerator); 491 492 static if (generateRandomAll) 493 { 494 outputStream.formatValue(lineScore, randomValueFormatSpec); 495 outputStream.put(cmdopt.delim); 496 outputStream.put(line); 497 outputStream.put("\n"); 498 499 if (cmdopt.sampleSize != 0) 500 { 501 ++numLinesWritten; 502 if (numLinesWritten == cmdopt.sampleSize) return; 503 } 504 } 505 else if (lineScore < cmdopt.inclusionProbability) 506 { 507 if (cmdopt.printRandom) 508 { 509 outputStream.formatValue(lineScore, randomValueFormatSpec); 510 outputStream.put(cmdopt.delim); 511 } 512 outputStream.put(line); 513 outputStream.put("\n"); 514 515 if (cmdopt.sampleSize != 0) 516 { 517 ++numLinesWritten; 518 if (numLinesWritten == cmdopt.sampleSize) return; 519 } 520 } 521 } 522 } 523 } 524 } 525 526 /* bernoulliSkipSampling is an alternate implementation of bernoulliSampling that 527 * uses skip sampling. 528 * 529 * Skip sampling works by skipping a random number of lines between selections. This 530 * can be faster than assigning a random value to each line when the inclusion 531 * probability is low, as it reduces the number of calls to the random number 532 * generator. Both the random number generator and the log() function as called when 533 * calculating the next skip size. 
These additional log() calls add up as the 534 * probability increases. 535 * 536 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 537 * file-oriented line sampling. This is obviously environment specific. In the 538 * environments this implementation has been tested in the perfmance improvements 539 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 540 * 541 * The algorithm does not assign random values to individual lines. This makes it 542 * incompatible with random value printing. It is not suitable for compatibility mode 543 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 544 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 545 * does not have this property. 546 * 547 * The algorithm for calculating the skip size has been described by multiple sources. 548 * There are two key variants depending on whether the total number of lines in the 549 * data set is known in advance. (This implementation does not know the total.) 550 * Useful references: 551 * - Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 552 * ACM Trans on Mathematical Software, 1987. On-line: 553 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 554 * - P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book 555 * "Data Stream Management", Springer-Verlag, 2016. On-line: 556 * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf 557 * - Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. 
On-line: 558 * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ 559 */ 560 void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream) 561 if (isOutputRange!(OutputRange, char)) 562 { 563 import std.conv : to; 564 import std.math : log, trunc; 565 import std.random : Random = Mt19937, uniform01; 566 import tsvutil : throwIfWindowsNewlineOnUnix; 567 568 assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 569 assert(!cmdopt.printRandom); 570 assert(!cmdopt.compatibilityMode); 571 572 auto randomGenerator = Random(cmdopt.seed); 573 574 immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 575 immutable double logDiscardRate = log(discardRate); 576 577 /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed 578 * interval to (0.0, 1.0], excluding 0.0. 579 */ 580 size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 581 582 /* Process each line. */ 583 bool headerWritten = false; 584 size_t numLinesWritten = 0; 585 foreach (filename; cmdopt.files) 586 { 587 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 588 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 589 { 590 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 591 if (fileLineNum == 1 && cmdopt.hasHeader) 592 { 593 if (!headerWritten) 594 { 595 outputStream.put(line); 596 outputStream.put("\n"); 597 headerWritten = true; 598 } 599 } 600 else if (remainingSkips > 0) 601 { 602 --remainingSkips; 603 } 604 else 605 { 606 outputStream.put(line); 607 outputStream.put("\n"); 608 609 if (cmdopt.sampleSize != 0) 610 { 611 ++numLinesWritten; 612 if (numLinesWritten == cmdopt.sampleSize) return; 613 } 614 615 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 616 } 617 } 618 } 619 } 620 621 /** Sample a subset of the unique values from the key fields. 622 * 623 * Distinct sampling is done by hashing the key and mapping the hash value into 624 * buckets matching the inclusion probability. Records having a key mapping to bucket 625 * zero are output. 626 * 627 * TODO: Add whole line as key. 628 */ 629 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 630 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 631 if (isOutputRange!(OutputRange, char)) 632 { 633 import std.algorithm : splitter; 634 import std.conv : to; 635 import std.digest.murmurhash; 636 import std.math : lrint; 637 import tsvutil : InputFieldReordering, throwIfWindowsNewlineOnUnix; 638 639 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 640 else assert(!cmdopt.genRandomInorder); 641 642 assert(cmdopt.keyFields.length > 0); 643 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 644 645 static if (generateRandomAll) 646 { 647 import std.format : formatValue, singleSpec; 648 immutable randomValueFormatSpec = singleSpec("%d"); 649 } 650 651 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 
652 653 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 654 655 /* Create a mapping for the key fields. */ 656 auto keyFieldsReordering = new InputFieldReordering!char(cmdopt.keyFields); 657 658 /* Process each line. */ 659 bool headerWritten = false; 660 size_t numLinesWritten = 0; 661 foreach (filename; cmdopt.files) 662 { 663 auto inputStream = (filename == "-") ? stdin : filename.File(); 664 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 665 { 666 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 667 if (fileLineNum == 1 && cmdopt.hasHeader) 668 { 669 if (!headerWritten) 670 { 671 static if (generateRandomAll) 672 { 673 outputStream.put(cmdopt.randomValueHeader); 674 outputStream.put(cmdopt.delim); 675 } 676 else if (cmdopt.printRandom) 677 { 678 outputStream.put(cmdopt.randomValueHeader); 679 outputStream.put(cmdopt.delim); 680 } 681 682 outputStream.put(line); 683 outputStream.put("\n"); 684 headerWritten = true; 685 } 686 } 687 else 688 { 689 /* Gather the key field values and assemble the key. */ 690 keyFieldsReordering.initNewLine; 691 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 692 { 693 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 694 if (keyFieldsReordering.allFieldsFilled) break; 695 } 696 697 if (!keyFieldsReordering.allFieldsFilled) 698 { 699 import std.format : format; 700 throw new Exception( 701 format("Not enough fields in line. File: %s, Line: %s", 702 (filename == "-") ? 
"Standard Input" : filename, fileLineNum)); 703 } 704 705 auto hasher = MurmurHash3!32(cmdopt.seed); 706 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 707 { 708 if (count > 0) hasher.put(delimArray); 709 hasher.put(cast(ubyte[]) key); 710 } 711 hasher.finish; 712 713 static if (generateRandomAll) 714 { 715 import std.conv : to; 716 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 717 outputStream.put(cmdopt.delim); 718 outputStream.put(line); 719 outputStream.put("\n"); 720 721 if (cmdopt.sampleSize != 0) 722 { 723 ++numLinesWritten; 724 if (numLinesWritten == cmdopt.sampleSize) return; 725 } 726 } 727 else if (hasher.get % numBuckets == 0) 728 { 729 if (cmdopt.printRandom) 730 { 731 outputStream.put('0'); 732 outputStream.put(cmdopt.delim); 733 } 734 outputStream.put(line); 735 outputStream.put("\n"); 736 737 if (cmdopt.sampleSize != 0) 738 { 739 ++numLinesWritten; 740 if (numLinesWritten == cmdopt.sampleSize) return; 741 } 742 } 743 } 744 } 745 } 746 } 747 748 /** Reservoir sampling on the input stream. 749 * 750 * This routine selects the appropriate reservior sampling function and template 751 * instantiation to use based on the command line arguments. 752 * 753 * Reservoir sampling is used when a fixed size sample is being pulled from an input 754 * stream. Weighted and unweighted sampling is supported. These routines also 755 * randomize the order of the selected lines. This is consistent with line order 756 * randomization of the entire input stream (handled by randomizeLinesCommand). 757 * 758 * For unweighted sampling, there is a performance tradeoff choice between the two 759 * available implementations. See the reservoirSampling documentation for 760 * information. The threshold used here was chosen based on performance tests. 
761 */ 762 763 void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 764 if (isOutputRange!(OutputRange, char)) 765 { 766 assert(cmdopt.sampleSize != 0); 767 768 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 769 770 if (cmdopt.hasWeightField) 771 { 772 reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream); 773 } 774 else if (cmdopt.compatibilityMode || 775 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 776 { 777 reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream); 778 } 779 else 780 { 781 reservoirSamplingAlgorithmR(cmdopt, outputStream); 782 } 783 } 784 785 /** Reservior sampling using a heap. Both weighted and unweighted random sampling are 786 * supported. 787 * 788 * The algorithm used here is based on the one-pass algorithm described by Pavlos 789 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 790 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are 791 * simply set to one. 792 * 793 * The implementation uses a heap (priority queue) large enough to hold the desired 794 * number of lines. Input is read line-by-line, assigned a random value, and added to 795 * the heap. The role of the identify the lines with the highest assigned random 796 * values. Once the heap is full, adding a new line means dropping the line with the 797 * lowest score. A "min" heap used for this reason. 798 * 799 * When done reading all lines, the "min" heap is in the opposite order needed for 800 * output. The desired order is obtained by removing each element one at at time from 801 * the heap. The underlying data store will have the elements in correct order. 802 * 803 * Generating output in weighted order matters for several reasons: 804 * - For weighted sampling, it preserves the property that smaller valid subsets can be 805 * created by taking the first N lines. 
 * - For unweighted sampling, it ensures that all output permutations are possible, and
 *   are not influenced by input order or the heap data structure used.
 * - Order consistency when making repeated use of the same random seeds, but with
 *   different sample sizes.
 *
 * There are use cases where only the selection set matters, for these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order, but making this distinction seems an unnecessary complication.
 *
 * Notes:
 * - In tsv-sample versions 1.2.1 and earlier this routine also supported randomization
 *   of all input lines. This was dropped in version 1.2.2 in favor of the approach
 *   used in randomizeLines. The latter has significant advantages given that all
 *   data must be read into memory.
 * - For larger reservoir sizes better performance can be achieved by using
 *   reservoirSamplingAlgorithmR. See the documentation for that function for details.
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    auto randomGenerator = Random(cmdopt.seed);

    /* A reservoir entry: the random score assigned to a line and a copy of the line. */
    struct Entry
    {
        double score;
        char[] line;
    }

    /* Create the heap and backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed
     * by removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */

    Array!Entry dataStore;
    dataStore.reserve(cmdopt.sampleSize);
    auto reservoir = dataStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* Process each line. */
    bool headerWritten = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                static if (!isWeighted)
                {
                    double lineScore = uniform01(randomGenerator);
                }
                else
                {
                    /* Weighted score: u ^^ (1 / weight). Lines with a weight that is
                     * not positive get a score of zero.
                     */
                    double lineWeight =
                        getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);
                    double lineScore =
                        (lineWeight > 0.0)
                        ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                        : 0.0;
                }

                /* Fill the reservoir first. Once full, a new line replaces the current
                 * lowest scoring entry only if its score is higher. The line is copied
                 * (dup) before being stored.
                 */
                if (reservoir.length < cmdopt.sampleSize)
                {
                    reservoir.insert(Entry(lineScore, line.dup));
                }
                else if (reservoir.front.score < lineScore)
                {
                    reservoir.replaceFront(Entry(lineScore, line.dup));
                }
            }
        }
    }

    /* All entries are in the reservoir. Time to print. The heap is in reverse order
     * of assigned weights. Reversing order is done by removing all elements from the
     * heap, this leaves the backing store in the correct order for output.
     *
     * The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the reservoir.
     */
    size_t numLines = reservoir.length;
    assert(numLines == dataStore.length);

    while (!reservoir.empty) reservoir.removeFront;
    assert(numLines == dataStore.length);

    immutable randomValueFormatSpec = singleSpec("%.17g");

    foreach (entry; dataStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatValue(entry.score, randomValueFormatSpec);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}

/** Generates weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSampling.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;
    import std.random : Random = Mt19937, uniform01;
    import tsvutil : throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    auto randomGenerator = Random(cmdopt.seed);
    immutable randomValueFormatSpec = singleSpec("%.17g");

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ?
stdin : filename.File(); 961 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 962 { 963 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 964 if (fileLineNum == 1 && cmdopt.hasHeader) 965 { 966 if (!headerWritten) 967 { 968 outputStream.put(cmdopt.randomValueHeader); 969 outputStream.put(cmdopt.delim); 970 outputStream.put(line); 971 outputStream.put("\n"); 972 headerWritten = true; 973 } 974 } 975 else 976 { 977 double lineWeight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 978 filename, fileLineNum); 979 double lineScore = 980 (lineWeight > 0.0) 981 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 982 : 0.0; 983 984 outputStream.formatValue(lineScore, randomValueFormatSpec); 985 outputStream.put(cmdopt.delim); 986 outputStream.put(line); 987 outputStream.put("\n"); 988 989 if (cmdopt.sampleSize != 0) 990 { 991 ++numLinesWritten; 992 if (numLinesWritten == cmdopt.sampleSize) return; 993 } 994 } 995 } 996 } 997 } 998 999 /** Reservoir sampling, Algorithm R 1000 * 1001 * This is an implementation of reservoir sampling using what is commonly known as 1002 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of 1003 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about 1004 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with 1005 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" 1006 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). 1007 * 1008 * Algorithm R is used for unweighted sampling without replacement. The heap-based 1009 * algorithm in reservoirSamplingViaHeap is used for weighted sampling. 1010 * 1011 * The classic algorithm stops after identifying the selected set of items. This 1012 * implementation goes one step further and randomizes the order of the selected 1013 * lines. This supports the tsv-sample use-case, which is line order randomization. 
1014 * 1015 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1016 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1017 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1018 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1019 * 1020 * This speed advantage may be offset a certain amount by using a more expensive random 1021 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1022 * whereas reservoirSamplingAlgorithR generates random integers over and ever growing 1023 * interval. The latter is expected to be more expensive. This is consistent with 1024 * performance test indicating that reservoirSamplingViaHeap is faster when using 1025 * small-to-medium size reservoirs and large input streams. 1026 */ 1027 void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1028 if (isOutputRange!(OutputRange, char)) 1029 { 1030 import std.random : Random = Mt19937, randomShuffle, uniform; 1031 import tsvutil : throwIfWindowsNewlineOnUnix; 1032 1033 assert(cmdopt.sampleSize > 0); 1034 assert(!cmdopt.hasWeightField); 1035 assert(!cmdopt.compatibilityMode); 1036 assert(!cmdopt.printRandom); 1037 assert(!cmdopt.genRandomInorder); 1038 1039 string[] reservoir; 1040 auto reservoirAppender = appender(&reservoir); 1041 reservoirAppender.reserve(cmdopt.sampleSize); 1042 1043 auto randomGenerator = Random(cmdopt.seed); 1044 1045 /* Process each line. */ 1046 1047 bool headerWritten = false; 1048 size_t totalLineNum = 0; 1049 foreach (filename; cmdopt.files) 1050 { 1051 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 1052 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 1053 { 1054 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 1055 if (fileLineNum == 1 && cmdopt.hasHeader) 1056 { 1057 if (!headerWritten) 1058 { 1059 outputStream.put(line); 1060 outputStream.put("\n"); 1061 headerWritten = true; 1062 } 1063 } 1064 else 1065 { 1066 /* Add lines to the reservoir until the reservoir is filled. 1067 * After that lines are added with decreasing likelihood, based on 1068 * the total number of lines seen. If added to the reservoir, the 1069 * line replaces a randomly chosen existing line. 1070 */ 1071 if (totalLineNum < cmdopt.sampleSize) 1072 { 1073 reservoirAppender ~= line.idup; 1074 } 1075 else 1076 { 1077 size_t i = uniform(0, totalLineNum, randomGenerator); 1078 if (i < reservoir.length) reservoir[i] = line.idup; 1079 } 1080 1081 ++totalLineNum; 1082 } 1083 } 1084 } 1085 1086 /* The random sample is now in the reservior. Shuffle it and print. */ 1087 1088 reservoir.randomShuffle(randomGenerator); 1089 1090 foreach (ref line; reservoir) 1091 { 1092 outputStream.put(line); 1093 outputStream.put("\n"); 1094 } 1095 } 1096 1097 /** Randomize all the lines in files or standard input. 1098 * 1099 * This routine selects the appropriate randomize-lines function and template instantiation 1100 * to use based on the command line arguments. 1101 */ 1102 void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1103 if (isOutputRange!(OutputRange, char)) 1104 { 1105 if (cmdopt.hasWeightField) 1106 { 1107 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1108 } 1109 else if (cmdopt.compatibilityMode) 1110 { 1111 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1112 } 1113 else 1114 { 1115 randomizeLinesViaShuffle(cmdopt, outputStream); 1116 } 1117 } 1118 1119 /** Randomize all the lines in files or standard input. 
1120 * 1121 * All lines in files and/or standard input are read in and written out in random 1122 * order. This algorithm assigns a random value to each line and sorts. This approach 1123 * supports both weighted sampling and simple random sampling (unweighted). 1124 * 1125 * This is significantly faster than heap-based reservoir sampling in the case where 1126 * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted 1127 * case, as it is a little faster, at the cost not supporting random value printing or 1128 * compatibility-mode. 1129 * 1130 * Input data size is limited by available memory. Disk oriented techniques are needed 1131 * when data sizes are larger. For example, generating random values line-by-line (ala 1132 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1133 */ 1134 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1135 if (isOutputRange!(OutputRange, char)) 1136 { 1137 import std.algorithm : map, sort; 1138 import std.format : formatValue, singleSpec; 1139 1140 static if (isWeighted) assert(cmdopt.hasWeightField); 1141 else assert(!cmdopt.hasWeightField); 1142 1143 assert(cmdopt.sampleSize == 0); 1144 1145 /* 1146 * Read all file data into memory. Then split the data into lines and assign a 1147 * random value to each line. identifyFileLines also writes the first header line. 1148 */ 1149 auto fileData = cmdopt.files.map!FileData.array; 1150 auto inputLines = fileData.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream); 1151 1152 /* 1153 * Sort by the weight and output the lines. 
1154 */ 1155 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1156 1157 immutable randomValueFormatSpec = singleSpec("%.17g"); 1158 1159 foreach (lineEntry; inputLines) 1160 { 1161 if (cmdopt.printRandom) 1162 { 1163 outputStream.formatValue(lineEntry.randomValue, randomValueFormatSpec); 1164 outputStream.put(cmdopt.delim); 1165 } 1166 outputStream.put(lineEntry.data); 1167 outputStream.put("\n"); 1168 } 1169 } 1170 1171 /** Randomize all the lines in files or standard input. 1172 * 1173 * All lines in files and/or standard input are read in and written out in random 1174 * order. This routine uses array shuffling, which is faster than sorting. This makes 1175 * this routine a good alternative to randomizeLinesViaSort when doing unweighted 1176 * randomization. 1177 * 1178 * Input data size is limited by available memory. Disk oriented techniques are needed 1179 * when data sizes are larger. For example, generating random values line-by-line (ala 1180 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1181 * 1182 * This routine does not support random value printing or compatibility-mode. 1183 */ 1184 void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1185 if (isOutputRange!(OutputRange, char)) 1186 { 1187 import std.algorithm : map; 1188 import std.random : Random = Mt19937, randomShuffle; 1189 1190 assert(cmdopt.sampleSize == 0); 1191 assert(!cmdopt.hasWeightField); 1192 assert(!cmdopt.printRandom); 1193 assert(!cmdopt.genRandomInorder); 1194 1195 /* 1196 * Read all file data into memory and split into lines. 1197 */ 1198 auto fileData = cmdopt.files.map!FileData.array; 1199 auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1200 1201 /* 1202 * Randomly shuffle and print each line. 1203 * 1204 * Note: Also tried randomCover, but that was exceedingly slow. 
1205 */ 1206 import std.random : randomShuffle; 1207 1208 auto randomGenerator = Random(cmdopt.seed); 1209 inputLines.randomShuffle(randomGenerator); 1210 1211 foreach (ref line; inputLines) 1212 { 1213 outputStream.put(line.data); 1214 outputStream.put("\n"); 1215 } 1216 } 1217 1218 /** Simple random sampling with replacement. 1219 * 1220 * All lines in files and/or standard input are read in. Then random lines are selected 1221 * one at a time and output. Lines can be selected multiple times. This process continues 1222 * until the desired number of samples (--n|num) has been output. Output continues 1223 * indefinitely if a sample size was not provided. 1224 */ 1225 void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1226 if (isOutputRange!(OutputRange, char)) 1227 { 1228 import std.algorithm : map; 1229 import std.format : formatValue, singleSpec; 1230 import std.random : Random = Mt19937, uniform; 1231 1232 /* 1233 * Read all file data into memory and split the data into lines. 1234 */ 1235 auto fileData = cmdopt.files.map!FileData.array; 1236 auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1237 1238 if (inputLines.length > 0) 1239 { 1240 auto randomGenerator = Random(cmdopt.seed); 1241 1242 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1243 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1244 while (numLeft != 0) 1245 { 1246 size_t index = uniform(0, inputLines.length, randomGenerator); 1247 outputStream.put(inputLines[index].data); 1248 outputStream.put("\n"); 1249 if (cmdopt.sampleSize != 0) numLeft--; 1250 } 1251 } 1252 } 1253 1254 /** A container and reader data form a file or standard input. 1255 * 1256 * The FileData struct is used to read data from a file or standard input. It is used 1257 * by passing a filename to the constructor. The constructor reads the file data. 
1258 * If the filename is a single hyphen ('-') then data is read from standard input. 1259 * 1260 * The struct make the data available through two members: 'filename', which is the 1261 * filename, and 'data', which is a character array of the data. 1262 */ 1263 struct FileData 1264 { 1265 string filename; 1266 char[] data; 1267 1268 this(string fname) 1269 { 1270 import std.algorithm : min; 1271 import std.array : appender; 1272 1273 filename = fname; 1274 1275 ubyte[1024 * 128] fileRawBuf; 1276 auto dataAppender = appender(&data); 1277 auto ifile = (filename == "-") ? stdin : filename.File; 1278 1279 if (filename != "-") 1280 { 1281 ulong filesize = ifile.size; 1282 if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max)); 1283 } 1284 1285 foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer); 1286 } 1287 } 1288 1289 /** HasRandomValue is a boolean flag used at compile time by identifyFileLines to 1290 * distinguish use cases needing random value assignments from those that don't. 1291 */ 1292 alias HasRandomValue = Flag!"hasRandomValue"; 1293 1294 /** An InputLine array is returned by identifyFileLines to represent each non-header line 1295 * line found in a FileData array. The 'data' element contains the line. A 'randomValue' 1296 * line is included if random values are being generated. 1297 */ 1298 struct InputLine(HasRandomValue hasRandomValue) 1299 { 1300 char[] data; 1301 static if (hasRandomValue) double randomValue; 1302 } 1303 1304 /** identifyFileLines is used by algorithms that read all files into memory prior to 1305 * processing. It does the initial processing of the file data. 1306 * 1307 * Three primary tasks are performed. One is splitting all input data into lines. The 1308 * second is writting the header line from the first file to the output stream. Header 1309 * lines from subsequent files are ignored. 
Third is assigning a random value to the 1310 * line, if random values are being generated. 1311 * 1312 * The key input is a FileData array, one element for each file. The FileData reads 1313 * the file when instantiated. 1314 * 1315 * The return value is an array of InputLine structs. The struct will have a 'randomValue' 1316 * member if random values are being assigned. 1317 */ 1318 InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange) 1319 (ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1320 if (isOutputRange!(OutputRange, char)) 1321 { 1322 import std.algorithm : splitter; 1323 import std.array : appender; 1324 import std.random : Random = Mt19937, uniform01; 1325 import tsvutil : throwIfWindowsNewlineOnUnix; 1326 1327 static assert(hasRandomValue || !isWeighted); 1328 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1329 1330 InputLine!hasRandomValue[] inputLines; 1331 1332 auto linesAppender = appender(&inputLines); 1333 static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed); 1334 bool headerWritten = false; 1335 1336 foreach (fd; fileData) 1337 { 1338 /* Drop the last newline to avoid adding an extra empty line. */ 1339 auto data = (fd.data.length > 0 && fd.data[$ - 1] == '\n') ? fd.data[0 .. 
$ - 1] : fd.data; 1340 foreach (fileLineNum, ref line; data.splitter('\n').enumerate(1)) 1341 { 1342 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum); 1343 if (fileLineNum == 1 && cmdopt.hasHeader) 1344 { 1345 if (!headerWritten) 1346 { 1347 if (cmdopt.printRandom) 1348 { 1349 outputStream.put(cmdopt.randomValueHeader); 1350 outputStream.put(cmdopt.delim); 1351 } 1352 outputStream.put(line); 1353 outputStream.put("\n"); 1354 headerWritten = true; 1355 } 1356 } 1357 else 1358 { 1359 static if (!hasRandomValue) 1360 { 1361 linesAppender.put(InputLine!hasRandomValue(line)); 1362 } 1363 else 1364 { 1365 static if (!isWeighted) 1366 { 1367 double randomValue = uniform01(randomGenerator); 1368 } 1369 else 1370 { 1371 double lineWeight = 1372 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 1373 fd.filename, fileLineNum); 1374 double randomValue = 1375 (lineWeight > 0.0) 1376 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1377 : 0.0; 1378 } 1379 1380 linesAppender.put(InputLine!hasRandomValue(line, randomValue)); 1381 } 1382 } 1383 } 1384 } 1385 1386 return inputLines; 1387 } 1388 1389 1390 /** Convenience function for extracting a single field from a line. See getTsvFieldValue in 1391 * common/src/tsvutils.d for details. This wrapper creates error text tailored for this program. 1392 */ 1393 import std.traits : isSomeChar; 1394 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe 1395 if (isSomeChar!C) 1396 { 1397 import std.conv : ConvException, to; 1398 import std.format : format; 1399 import tsvutil : getTsvFieldValue; 1400 1401 T val; 1402 try 1403 { 1404 val = getTsvFieldValue!T(line, fieldIndex, delim); 1405 } 1406 catch (ConvException exc) 1407 { 1408 throw new Exception( 1409 format("Could not process line: %s\n File: %s Line: %s%s", 1410 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 1411 (lineNum == 1) ? "\n Is this a header line? 
Use --H|header to skip." : "")); 1412 } 1413 catch (Exception exc) 1414 { 1415 /* Not enough fields on the line. */ 1416 throw new Exception( 1417 format("Could not process line: %s\n File: %s Line: %s", 1418 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 1419 } 1420 1421 return val; 1422 } 1423 1424 unittest 1425 { 1426 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 1427 * These tests make basic sanity checks on the getFieldValue wrapper. 1428 */ 1429 import std.exception; 1430 1431 assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 1432 assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 1433 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 1434 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 1435 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 1436 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); 1437 } 1438 1439 /* Unit tests for the main program start here. 1440 * 1441 * Portability note: Many of the tests here rely on generating consistent random numbers 1442 * across different platforms when using the same random seed. So far this has succeeded 1443 * on several different platorm, compiler, and library versions. However, it is certainly 1444 * possible this condition will not hold on other platforms. 1445 * 1446 * For tsv-sample, this portability implies generating the same results on different 1447 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, 1448 * but it is convenient for testing. If platforms are identified that do not generate 1449 * the same results these tests will need to be adjusted. 1450 */ 1451 version(unittest) 1452 { 1453 /* Unit test helper functions. */ 1454 1455 import unittest_utils; // tsv unit test helpers, from common/src/. 
1456 import std.conv : to; 1457 1458 void testTsvSample(string[] cmdArgs, string[][] expected) 1459 { 1460 import std.array : appender; 1461 import std.format : format; 1462 1463 assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); 1464 1465 auto formatAssertMessage(T...)(string msg, T formatArgs) 1466 { 1467 auto formatString = "[testTsvSample] %s: " ~ msg; 1468 return format(formatString, cmdArgs[0], formatArgs); 1469 } 1470 1471 TsvSampleOptions cmdopt; 1472 auto savedCmdArgs = cmdArgs.to!string; 1473 auto r = cmdopt.processArgs(cmdArgs); 1474 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 1475 auto output = appender!(char[])(); 1476 1477 tsvSample(cmdopt, output); // This invokes the main code line. 1478 1479 auto expectedOutput = expected.tsvDataToString; 1480 1481 assert(output.data == expectedOutput, 1482 formatAssertMessage( 1483 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 1484 expectedOutput.to!string, output.data.to!string)); 1485 } 1486 } 1487 1488 unittest 1489 { 1490 import std.path : buildPath; 1491 import std.file : rmdirRecurse; 1492 import std.format : format; 1493 1494 auto testDir = makeUnittestTempDir("tsv_sample"); 1495 scope(exit) testDir.rmdirRecurse; 1496 1497 /* Tabular data sets and expected results use the built-in static seed. 1498 * Tests are run by writing the data set to a file, then calling the main 1499 * routine to process. The function testTsvSample plays the role of the 1500 * main program. Rather than writing to expected output, the results are 1501 * matched against expected. The expected results were verified by hand 1502 * prior to inclusion in the test. 1503 * 1504 * The initial part of this section is simply setting up data files and 1505 * expected results. 1506 * 1507 * Expected results naming conventions: 1508 * - Prefix: dataNxMExpected. N and M are numbers. e.g. 
data3x6Expected 1509 * - Sampling Type (required): Permute, Replace, Bernoulli, Distinct 1510 * - Compatibility: Compat, AlgoR, Skip, Swap 1511 * - Weight Field: Wt<num>, e.g. Wt3 1512 * - Sample Size: Num<num>, eg. Num3 1513 * - Seed Value: V<num>, eg. V77 1514 * - Key Field: K<num>, e.g. K2 1515 * - Probability: P<num>, e.g P05 (5%) 1516 * - Printing Probalities: Probs 1517 * - Printing Probs in order: ProbsInorder 1518 * - Printing Probs with custom header: RVCustom 1519 */ 1520 1521 /* Empty file. */ 1522 string[][] dataEmpty = []; 1523 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1524 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 1525 1526 /* 3x1, header only. */ 1527 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1528 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1529 writeUnittestTsvFile(fpath_data3x0, data3x0); 1530 1531 /* 3x1 */ 1532 string[][] data3x1 = 1533 [["field_a", "field_b", "field_c"], 1534 ["tan", "タン", "8.5"]]; 1535 1536 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1537 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1538 writeUnittestTsvFile(fpath_data3x1, data3x1); 1539 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 1540 1541 string[][] data3x1ExpectedReplaceNum3 = 1542 [["field_a", "field_b", "field_c"], 1543 ["tan", "タン", "8.5"], 1544 ["tan", "タン", "8.5"], 1545 ["tan", "タン", "8.5"]]; 1546 1547 /* 3x2 */ 1548 string[][] data3x2 = 1549 [["field_a", "field_b", "field_c"], 1550 ["brown", "褐色", "29.2"], 1551 ["gray", "グレー", "6.2"]]; 1552 1553 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1554 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1555 writeUnittestTsvFile(fpath_data3x2, data3x2); 1556 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 1557 1558 string[][] data3x2PermuteCompat = 1559 [["field_a", "field_b", "field_c"], 1560 ["gray", "グレー", "6.2"], 1561 ["brown", "褐色", "29.2"]]; 1562 1563 
string[][] data3x2PermuteShuffle = 1564 [["field_a", "field_b", "field_c"], 1565 ["gray", "グレー", "6.2"], 1566 ["brown", "褐色", "29.2"]]; 1567 1568 /* 3x3 */ 1569 string[][] data3x3 = 1570 [["field_a", "field_b", "field_c"], 1571 ["orange", "オレンジ", "2.5"], 1572 ["pink", "ピンク", "1.1"], 1573 ["purple", "紫の", "42"]]; 1574 1575 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1576 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1577 writeUnittestTsvFile(fpath_data3x3, data3x3); 1578 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 1579 1580 string[][] data3x3ExpectedPermuteCompat = 1581 [["field_a", "field_b", "field_c"], 1582 ["purple", "紫の", "42"], 1583 ["pink", "ピンク", "1.1"], 1584 ["orange", "オレンジ", "2.5"]]; 1585 1586 string[][] data3x3ExpectedPermuteSwap = 1587 [["field_a", "field_b", "field_c"], 1588 ["purple", "紫の", "42"], 1589 ["orange", "オレンジ", "2.5"], 1590 ["pink", "ピンク", "1.1"]]; 1591 1592 /* 3x6 */ 1593 string[][] data3x6 = 1594 [["field_a", "field_b", "field_c"], 1595 ["red", "赤", "23.8"], 1596 ["green", "緑", "0.0072"], 1597 ["white", "白", "1.65"], 1598 ["yellow", "黄", "12"], 1599 ["blue", "青", "12"], 1600 ["black", "黒", "0.983"]]; 1601 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1602 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1603 writeUnittestTsvFile(fpath_data3x6, data3x6); 1604 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 1605 1606 // Randomization, all lines 1607 string[][] data3x6ExpectedPermuteCompat = 1608 [["field_a", "field_b", "field_c"], 1609 ["yellow", "黄", "12"], 1610 ["black", "黒", "0.983"], 1611 ["blue", "青", "12"], 1612 ["white", "白", "1.65"], 1613 ["green", "緑", "0.0072"], 1614 ["red", "赤", "23.8"]]; 1615 1616 string[][] data3x6ExpectedPermuteSwap = 1617 [["field_a", "field_b", "field_c"], 1618 ["black", "黒", "0.983"], 1619 ["green", "緑", "0.0072"], 1620 ["red", "赤", "23.8"], 1621 ["yellow", "黄", "12"], 1622 ["white", "白", "1.65"], 
1623 ["blue", "青", "12"]]; 1624 1625 string[][] data3x6ExpectedPermuteCompatProbs = 1626 [["random_value", "field_a", "field_b", "field_c"], 1627 ["0.96055546286515892", "yellow", "黄", "12"], 1628 ["0.7571015392895788", "black", "黒", "0.983"], 1629 ["0.52525980887003243", "blue", "青", "12"], 1630 ["0.49287854949943721", "white", "白", "1.65"], 1631 ["0.15929344086907804", "green", "緑", "0.0072"], 1632 ["0.010968807619065046", "red", "赤", "23.8"]]; 1633 1634 /* Note: data3x6ExpectedAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 1635 * both are effectively the same algorithm given that --num is data length. Both read 1636 * in the full data in order then call randomShuffle. 1637 */ 1638 string[][] data3x6ExpectedPermuteAlgoRNum6 = 1639 [["field_a", "field_b", "field_c"], 1640 ["black", "黒", "0.983"], 1641 ["green", "緑", "0.0072"], 1642 ["red", "赤", "23.8"], 1643 ["yellow", "黄", "12"], 1644 ["white", "白", "1.65"], 1645 ["blue", "青", "12"]]; 1646 1647 string[][] data3x6ExpectedPermuteAlgoRNum5 = 1648 [["field_a", "field_b", "field_c"], 1649 ["red", "赤", "23.8"], 1650 ["black", "黒", "0.983"], 1651 ["white", "白", "1.65"], 1652 ["green", "緑", "0.0072"], 1653 ["yellow", "黄", "12"]]; 1654 1655 string[][] data3x6ExpectedPermuteAlgoRNum4 = 1656 [["field_a", "field_b", "field_c"], 1657 ["blue", "青", "12"], 1658 ["green", "緑", "0.0072"], 1659 ["black", "黒", "0.983"], 1660 ["white", "白", "1.65"]]; 1661 1662 string[][] data3x6ExpectedPermuteAlgoRNum3 = 1663 [["field_a", "field_b", "field_c"], 1664 ["red", "赤", "23.8"], 1665 ["black", "黒", "0.983"], 1666 ["green", "緑", "0.0072"]]; 1667 1668 string[][] data3x6ExpectedPermuteAlgoRNum2 = 1669 [["field_a", "field_b", "field_c"], 1670 ["black", "黒", "0.983"], 1671 ["red", "赤", "23.8"]]; 1672 1673 string[][] data3x6ExpectedPermuteAlgoRNum1 = 1674 [["field_a", "field_b", "field_c"], 1675 ["green", "緑", "0.0072"]]; 1676 1677 string[][] data3x6ExpectedBernoulliProbsP100 = 1678 [["random_value", "field_a", "field_b", 
"field_c"], 1679 ["0.010968807619065046", "red", "赤", "23.8"], 1680 ["0.15929344086907804", "green", "緑", "0.0072"], 1681 ["0.49287854949943721", "white", "白", "1.65"], 1682 ["0.96055546286515892", "yellow", "黄", "12"], 1683 ["0.52525980887003243", "blue", "青", "12"], 1684 ["0.7571015392895788", "black", "黒", "0.983"]]; 1685 1686 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 1687 [["random_value", "field_a", "field_b", "field_c"], 1688 ["0.010968807619065046", "red", "赤", "23.8"], 1689 ["0.15929344086907804", "green", "緑", "0.0072"], 1690 ["0.49287854949943721", "white", "白", "1.65"], 1691 ["0.52525980887003243", "blue", "青", "12"]]; 1692 1693 string[][] data3x6ExpectedBernoulliSkipP40 = 1694 [["field_a", "field_b", "field_c"], 1695 ["red", "赤", "23.8"], 1696 ["green", "緑", "0.0072"], 1697 ["yellow", "黄", "12"]]; 1698 1699 string[][] data3x6ExpectedBernoulliCompatP60 = 1700 [["field_a", "field_b", "field_c"], 1701 ["red", "赤", "23.8"], 1702 ["green", "緑", "0.0072"], 1703 ["white", "白", "1.65"], 1704 ["blue", "青", "12"]]; 1705 1706 string[][] data3x6ExpectedDistinctK1K3P60 = 1707 [["field_a", "field_b", "field_c"], 1708 ["green", "緑", "0.0072"], 1709 ["white", "白", "1.65"], 1710 ["blue", "青", "12"]]; 1711 1712 string[][] data3x6ExpectedDistinctK1K3P60Probs = 1713 [["random_value", "field_a", "field_b", "field_c"], 1714 ["0", "green", "緑", "0.0072"], 1715 ["0", "white", "白", "1.65"], 1716 ["0", "blue", "青", "12"]]; 1717 1718 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 1719 [["custom_random_value_header", "field_a", "field_b", "field_c"], 1720 ["0", "green", "緑", "0.0072"], 1721 ["0", "white", "白", "1.65"], 1722 ["0", "blue", "青", "12"]]; 1723 1724 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 1725 [["random_value", "field_a", "field_b", "field_c"], 1726 ["1", "red", "赤", "23.8"], 1727 ["0", "green", "緑", "0.0072"], 1728 ["0", "white", "白", "1.65"], 1729 ["1", "yellow", "黄", "12"], 1730 ["3", "blue", "青", "12"], 1731 ["2", "black", "黒", 
"0.983"]]; 1732 1733 string[][] data3x6ExpectedPermuteWt3Probs = 1734 [["random_value", "field_a", "field_b", "field_c"], 1735 ["0.9966519875764539", "yellow", "黄", "12"], 1736 ["0.94775884809836686", "blue", "青", "12"], 1737 ["0.82728234682286661", "red", "赤", "23.8"], 1738 ["0.75346697377181959", "black", "黒", "0.983"], 1739 ["0.65130103496422487", "white", "白", "1.65"], 1740 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 1741 1742 string[][] data3x6ExpectedWt3ProbsInorder = 1743 [["random_value", "field_a", "field_b", "field_c"], 1744 ["0.82728234682286661", "red", "赤", "23.8"], 1745 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 1746 ["0.65130103496422487", "white", "白", "1.65"], 1747 ["0.9966519875764539", "yellow", "黄", "12"], 1748 ["0.94775884809836686", "blue", "青", "12"], 1749 ["0.75346697377181959", "black", "黒", "0.983"]]; 1750 1751 string[][] data3x6ExpectedPermuteWt3 = 1752 [["field_a", "field_b", "field_c"], 1753 ["yellow", "黄", "12"], 1754 ["blue", "青", "12"], 1755 ["red", "赤", "23.8"], 1756 ["black", "黒", "0.983"], 1757 ["white", "白", "1.65"], 1758 ["green", "緑", "0.0072"]]; 1759 1760 string[][] data3x6ExpectedReplaceNum10 = 1761 [["field_a", "field_b", "field_c"], 1762 ["black", "黒", "0.983"], 1763 ["green", "緑", "0.0072"], 1764 ["green", "緑", "0.0072"], 1765 ["red", "赤", "23.8"], 1766 ["yellow", "黄", "12"], 1767 ["red", "赤", "23.8"], 1768 ["white", "白", "1.65"], 1769 ["yellow", "黄", "12"], 1770 ["yellow", "黄", "12"], 1771 ["white", "白", "1.65"], 1772 ]; 1773 1774 string[][] data3x6ExpectedReplaceNum10V77 = 1775 [["field_a", "field_b", "field_c"], 1776 ["black", "黒", "0.983"], 1777 ["red", "赤", "23.8"], 1778 ["black", "黒", "0.983"], 1779 ["yellow", "黄", "12"], 1780 ["green", "緑", "0.0072"], 1781 ["green", "緑", "0.0072"], 1782 ["green", "緑", "0.0072"], 1783 ["yellow", "黄", "12"], 1784 ["blue", "青", "12"], 1785 ["white", "白", "1.65"], 1786 ]; 1787 1788 /* Using a different static seed. 
*/ 1789 string[][] data3x6ExpectedPermuteCompatV41Probs = 1790 [["random_value", "field_a", "field_b", "field_c"], 1791 ["0.68057272653095424", "green", "緑", "0.0072"], 1792 ["0.67681624367833138", "blue", "青", "12"], 1793 ["0.32097338931635022", "yellow", "黄", "12"], 1794 ["0.25092361867427826", "red", "赤", "23.8"], 1795 ["0.15535934292711318", "black", "黒", "0.983"], 1796 ["0.04609582107514143", "white", "白", "1.65"]]; 1797 1798 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 1799 [["random_value", "field_a", "field_b", "field_c"], 1800 ["0.25092361867427826", "red", "赤", "23.8"], 1801 ["0.04609582107514143", "white", "白", "1.65"], 1802 ["0.32097338931635022", "yellow", "黄", "12"], 1803 ["0.15535934292711318", "black", "黒", "0.983"]]; 1804 1805 string[][] data3x6ExpectedPermuteWt3V41Probs = 1806 [["random_value", "field_a", "field_b", "field_c"], 1807 ["0.96799377498910666", "blue", "青", "12"], 1808 ["0.94356245792573568", "red", "赤", "23.8"], 1809 ["0.90964601024271996", "yellow", "黄", "12"], 1810 ["0.15491658409260103", "white", "白", "1.65"], 1811 ["0.15043620392537033", "black", "黒", "0.983"], 1812 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 1813 1814 string[][] data3x6ExpectedWt3V41ProbsInorder = 1815 [["random_value", "field_a", "field_b", "field_c"], 1816 ["0.94356245792573568", "red", "赤", "23.8"], 1817 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 1818 ["0.15491658409260103", "white", "白", "1.65"], 1819 ["0.90964601024271996", "yellow", "黄", "12"], 1820 ["0.96799377498910666", "blue", "青", "12"], 1821 ["0.15043620392537033", "black", "黒", "0.983"]]; 1822 1823 1824 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1825 string[][] combo1ExpectedPermuteCompat = 1826 [["field_a", "field_b", "field_c"], 1827 ["yellow", "黄", "12"], 1828 ["tan", "タン", "8.5"], 1829 ["brown", "褐色", "29.2"], 1830 ["green", "緑", "0.0072"], 1831 ["red", "赤", "23.8"], 1832 ["purple", "紫の", "42"], 1833 ["black", "黒", "0.983"], 1834 ["white", "白", "1.65"], 1835 ["gray", "グレー", "6.2"], 1836 ["blue", "青", "12"], 1837 ["pink", "ピンク", "1.1"], 1838 ["orange", "オレンジ", "2.5"]]; 1839 1840 string[][] combo1ExpectedPermuteCompatProbs = 1841 [["random_value", "field_a", "field_b", "field_c"], 1842 ["0.97088520275428891", "yellow", "黄", "12"], 1843 ["0.96055546286515892", "tan", "タン", "8.5"], 1844 ["0.81756894313730299", "brown", "褐色", "29.2"], 1845 ["0.7571015392895788", "green", "緑", "0.0072"], 1846 ["0.52525980887003243", "red", "赤", "23.8"], 1847 ["0.49287854949943721", "purple", "紫の", "42"], 1848 ["0.47081507067196071", "black", "黒", "0.983"], 1849 ["0.38388182921335101", "white", "白", "1.65"], 1850 ["0.29215990612283349", "gray", "グレー", "6.2"], 1851 ["0.24033216014504433", "blue", "青", "12"], 1852 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1853 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 1854 1855 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1856 string[][] combo1ExpectedProbsInorder = 1857 [["random_value", "field_a", "field_b", "field_c"], 1858 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1859 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1860 ["0.49287854949943721", "purple", "紫の", "42"], 1861 ["0.96055546286515892", "tan", "タン", "8.5"], 1862 ["0.52525980887003243", "red", "赤", "23.8"], 1863 ["0.7571015392895788", "green", "緑", "0.0072"], 1864 ["0.38388182921335101", "white", "白", "1.65"], 1865 ["0.97088520275428891", "yellow", "黄", "12"], 1866 ["0.24033216014504433", "blue", "青", "12"], 1867 ["0.47081507067196071", "black", "黒", "0.983"], 1868 ["0.81756894313730299", "brown", "褐色", "29.2"], 1869 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1870 1871 string[][] combo1ExpectedBernoulliCompatP50Probs = 1872 [["random_value", "field_a", "field_b", "field_c"], 1873 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1874 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1875 ["0.49287854949943721", "purple", "紫の", "42"], 1876 ["0.38388182921335101", "white", "白", "1.65"], 1877 ["0.24033216014504433", "blue", "青", "12"], 1878 ["0.47081507067196071", "black", "黒", "0.983"], 1879 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1880 1881 string[][] combo1ExpectedBernoulliCompatP40 = 1882 [["field_a", "field_b", "field_c"], 1883 ["orange", "オレンジ", "2.5"], 1884 ["pink", "ピンク", "1.1"], 1885 ["white", "白", "1.65"], 1886 ["blue", "青", "12"], 1887 ["gray", "グレー", "6.2"]]; 1888 1889 string[][] combo1ExpectedDistinctK1P40 = 1890 [["field_a", "field_b", "field_c"], 1891 ["orange", "オレンジ", "2.5"], 1892 ["red", "赤", "23.8"], 1893 ["green", "緑", "0.0072"], 1894 ["blue", "青", "12"], 1895 ["black", "黒", "0.983"]]; 1896 1897 string[][] combo1ExpectedPermuteWt3Probs = 1898 [["random_value", "field_a", "field_b", "field_c"], 1899 ["0.99754077523718754", "yellow", "黄", "12"], 1900 ["0.99527665440088786", "tan", "タン", "8.5"], 1901 ["0.99312578945741659", "brown", "褐色", "29.2"], 1902 ["0.98329602553389361", "purple", 
"紫の", "42"], 1903 ["0.9733096193808366", "red", "赤", "23.8"], 1904 ["0.88797551521739648", "blue", "青", "12"], 1905 ["0.81999230489041786", "gray", "グレー", "6.2"], 1906 ["0.55975569204250941", "white", "白", "1.65"], 1907 ["0.46472135609205739", "black", "黒", "0.983"], 1908 ["0.18824582704191337", "pink", "ピンク", "1.1"], 1909 ["0.1644613185329992", "orange", "オレンジ", "2.5"], 1910 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 1911 1912 string[][] combo1ExpectedPermuteWt3 = 1913 [["field_a", "field_b", "field_c"], 1914 ["yellow", "黄", "12"], 1915 ["tan", "タン", "8.5"], 1916 ["brown", "褐色", "29.2"], 1917 ["purple", "紫の", "42"], 1918 ["red", "赤", "23.8"], 1919 ["blue", "青", "12"], 1920 ["gray", "グレー", "6.2"], 1921 ["white", "白", "1.65"], 1922 ["black", "黒", "0.983"], 1923 ["pink", "ピンク", "1.1"], 1924 ["orange", "オレンジ", "2.5"], 1925 ["green", "緑", "0.0072"]]; 1926 1927 string[][] combo1ExpectedPermuteAlgoRNum4 = 1928 [["field_a", "field_b", "field_c"], 1929 ["blue", "青", "12"], 1930 ["gray", "グレー", "6.2"], 1931 ["brown", "褐色", "29.2"], 1932 ["white", "白", "1.65"]]; 1933 1934 string[][] combo1ExpectedReplaceNum10 = 1935 [["field_a", "field_b", "field_c"], 1936 ["gray", "グレー", "6.2"], 1937 ["yellow", "黄", "12"], 1938 ["yellow", "黄", "12"], 1939 ["white", "白", "1.65"], 1940 ["tan", "タン", "8.5"], 1941 ["white", "白", "1.65"], 1942 ["blue", "青", "12"], 1943 ["black", "黒", "0.983"], 1944 ["tan", "タン", "8.5"], 1945 ["purple", "紫の", "42"]]; 1946 1947 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 1948 string[][] data1x200 = 1949 [["field_a"], 1950 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 1951 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 1952 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 1953 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 1954 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 1955 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 1956 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 1957 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 1958 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 1959 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 1960 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 1961 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 1962 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 1963 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 1964 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 1965 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 1966 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 1967 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 1968 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 1969 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 1970 ]; 1971 1972 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 1973 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 1974 writeUnittestTsvFile(fpath_data1x200, data1x200); 1975 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]); 1976 1977 string[][] data1x200ExpectedBernoulliSkipV333P01 = 1978 [["field_a"], 1979 ["077"], 1980 ["119"]]; 1981 1982 string[][] data1x200ExpectedBernoulliSkipV333P02 = 1983 [["field_a"], 1984 ["038"], 1985 ["059"], 1986 ["124"], 1987 ["161"], 1988 ["162"], 1989 ["183"]]; 1990 1991 string[][] data1x200ExpectedBernoulliSkipV333P03 = 1992 [["field_a"], 1993 ["025"], 1994 ["039"], 1995 ["082"], 1996 ["107"], 1997 ["108"], 1998 ["122"], 1999 ["136"], 2000 ["166"], 2001 ["182"]]; 2002 2003 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2004 [["field_a"], 2005 ["072"]]; 2006 2007 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2008 [["field_a"], 2009 ["004"], 2010 ["072"]]; 2011 2012 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2013 [["field_a"], 2014 ["004"], 2015 ["072"], 2016 ["181"]]; 2017 2018 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2019 * only expected results. The header is from 3x0, the results are offset 1-position 2020 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2021 */ 2022 string[][] combo2ExpectedBernoulliSkipV333P03 = 2023 [["field_a", "field_b", "field_c"], 2024 ["024"], 2025 ["038"], 2026 ["081"], 2027 ["106"], 2028 ["107"], 2029 ["121"], 2030 ["135"], 2031 ["165"], 2032 ["181"]]; 2033 2034 2035 /* 1x10 - Simple 1-column file. 
*/ 2036 string[][] data1x10 = 2037 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2038 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2039 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2040 writeUnittestTsvFile(fpath_data1x10, data1x10); 2041 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 2042 2043 string[][] data1x10ExpectedPermuteCompat = 2044 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2045 2046 string[][] data1x10ExpectedPermuteWt1 = 2047 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2048 2049 /* 2x10a - Uniform distribution [0,1]. */ 2050 string[][] data2x10a = 2051 [["line", "weight"], 2052 ["1", "0.26788837"], 2053 ["2", "0.06601298"], 2054 ["3", "0.38627527"], 2055 ["4", "0.47379424"], 2056 ["5", "0.02966641"], 2057 ["6", "0.05636231"], 2058 ["7", "0.70529242"], 2059 ["8", "0.91836862"], 2060 ["9", "0.99103720"], 2061 ["10", "0.31401740"]]; 2062 2063 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2064 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2065 2066 string[][] data2x10aExpectedPermuteWt2Probs = 2067 [["random_value", "line", "weight"], 2068 ["0.96833865494543658", "8", "0.91836862"], 2069 ["0.91856842054413923", "4", "0.47379424"], 2070 ["0.25730832087795091", "7", "0.70529242"], 2071 ["0.2372531790701812", "9", "0.99103720"], 2072 ["0.16016096701872204", "3", "0.38627527"], 2073 ["0.090819662667243381", "10", "0.31401740"], 2074 ["0.0071764539244361172", "6", "0.05636231"], 2075 ["4.8318642951630057e-08", "1", "0.26788837"], 2076 ["3.7525692966535517e-10", "5", "0.02966641"], 2077 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2078 2079 /* 2x10b - Uniform distribution [0,1000]. 
*/ 2080 string[][] data2x10b = 2081 [["line", "weight"], 2082 ["1", "761"], 2083 ["2", "432"], 2084 ["3", "103"], 2085 ["4", "448"], 2086 ["5", "750"], 2087 ["6", "711"], 2088 ["7", "867"], 2089 ["8", "841"], 2090 ["9", "963"], 2091 ["10", "784"]]; 2092 2093 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2094 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2095 2096 string[][] data2x10bExpectedPermuteWt2Probs = 2097 [["random_value", "line", "weight"], 2098 ["0.99996486739067969", "8", "841"], 2099 ["0.99991017467137211", "4", "448"], 2100 ["0.99960871524873662", "6", "711"], 2101 ["0.999141885371438", "5", "750"], 2102 ["0.99903963250274785", "10", "784"], 2103 ["0.99889631825931946", "7", "867"], 2104 ["0.99852058315191139", "9", "963"], 2105 ["0.99575669679158918", "2", "432"], 2106 ["0.99408758732050595", "1", "761"], 2107 ["0.99315467761212362", "3", "103"]]; 2108 2109 /* 2x10c - Logarithmic distribution in random order. */ 2110 string[][] data2x10c = 2111 [["line", "weight"], 2112 ["1", "31.85"], 2113 ["2", "17403.31"], 2114 ["3", "653.84"], 2115 ["4", "8.23"], 2116 ["5", "2671.04"], 2117 ["6", "26226.08"], 2118 ["7", "1.79"], 2119 ["8", "354.56"], 2120 ["9", "35213.81"], 2121 ["10", "679.29"]]; 2122 2123 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2124 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2125 2126 string[][] data2x10cExpectedPermuteWt2Probs = 2127 [["random_value", "line", "weight"], 2128 ["0.99998939008709697", "6", "26226.08"], 2129 ["0.99995951291695517", "9", "35213.81"], 2130 ["0.99991666907613541", "8", "354.56"], 2131 ["0.9998944505218641", "2", "17403.31"], 2132 ["0.9997589760286163", "5", "2671.04"], 2133 ["0.99891852769877643", "3", "653.84"], 2134 ["0.99889167752782515", "10", "679.29"], 2135 ["0.99512207506850148", "4", "8.23"], 2136 ["0.86789371584259023", "1", "31.85"], 2137 ["0.5857443816291561", "7", "1.79"]]; 2138 2139 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 2140 string[][] data2x10d = 2141 [["line", "weight"], 2142 ["1", "1.79"], 2143 ["2", "8.23"], 2144 ["3", "31.85"], 2145 ["4", "354.56"], 2146 ["5", "653.84"], 2147 ["6", "679.29"], 2148 ["7", "2671.04"], 2149 ["8", "17403.31"], 2150 ["9", "26226.08"], 2151 ["10", "35213.81"]]; 2152 2153 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 2154 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 2155 2156 string[][] data2x10dExpectedPermuteWt2Probs = 2157 [["random_value", "line", "weight"], 2158 ["0.99999830221846353", "8", "17403.31"], 2159 ["0.99997860834041397", "10", "35213.81"], 2160 ["0.99994563828986716", "9", "26226.08"], 2161 ["0.99988650363575737", "4", "354.56"], 2162 ["0.99964161939190088", "7", "2671.04"], 2163 ["0.99959045338948649", "6", "679.29"], 2164 ["0.99901574490639788", "5", "653.84"], 2165 ["0.97803163304747431", "3", "31.85"], 2166 ["0.79994791806910948", "2", "8.23"], 2167 ["0.080374261239949119", "1", "1.79"]]; 2168 2169 /* 2x10e. Logarithmic distribution in descending order. 
*/ 2170 string[][] data2x10e = 2171 [["line", "weight"], 2172 ["1", "35213.81"], 2173 ["2", "26226.08"], 2174 ["3", "17403.31"], 2175 ["4", "2671.04"], 2176 ["5", "679.29"], 2177 ["6", "653.84"], 2178 ["7", "354.56"], 2179 ["8", "31.85"], 2180 ["9", "8.23"], 2181 ["10", "1.79"]]; 2182 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 2183 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 2184 2185 string[][] data2x10eExpectedPermuteWt2Probs = 2186 [["random_value", "line", "weight"], 2187 ["0.99998493348975237", "4", "2671.04"], 2188 ["0.99995934807202624", "3", "17403.31"], 2189 ["0.99992995739727453", "2", "26226.08"], 2190 ["0.99987185679245649", "1", "35213.81"], 2191 ["0.99957451563173938", "6", "653.84"], 2192 ["0.99907273650209583", "8", "31.85"], 2193 ["0.99905260312968946", "5", "679.29"], 2194 ["0.99730333650516401", "7", "354.56"], 2195 ["0.84093902435227808", "9", "8.23"], 2196 ["0.65650015926290028", "10", "1.79"]]; 2197 2198 /* Data sets for distinct sampling. */ 2199 string[][] data5x25 = 2200 [["ID", "Shape", "Color", "Size", "Weight"], 2201 ["01", "circle", "red", "S", "10"], 2202 ["02", "circle", "black", "L", "20"], 2203 ["03", "square", "black", "L", "20"], 2204 ["04", "circle", "green", "L", "30"], 2205 ["05", "ellipse", "red", "S", "20"], 2206 ["06", "triangle", "red", "S", "10"], 2207 ["07", "triangle", "red", "L", "20"], 2208 ["08", "square", "black", "S", "10"], 2209 ["09", "circle", "black", "S", "20"], 2210 ["10", "square", "green", "L", "20"], 2211 ["11", "triangle", "red", "L", "20"], 2212 ["12", "circle", "green", "L", "30"], 2213 ["13", "ellipse", "red", "S", "20"], 2214 ["14", "circle", "green", "L", "30"], 2215 ["15", "ellipse", "red", "L", "30"], 2216 ["16", "square", "red", "S", "10"], 2217 ["17", "circle", "black", "L", "20"], 2218 ["18", "square", "red", "S", "20"], 2219 ["19", "square", "black", "L", "20"], 2220 ["20", "circle", "red", "S", "10"], 2221 ["21", "ellipse", "black", "L", "30"], 2222 ["22", "triangle", 
"red", "L", "30"], 2223 ["23", "circle", "green", "S", "20"], 2224 ["24", "square", "green", "L", "20"], 2225 ["25", "circle", "red", "S", "10"], 2226 ]; 2227 2228 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 2229 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 2230 writeUnittestTsvFile(fpath_data5x25, data5x25); 2231 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]); 2232 2233 string[][] data5x25ExpectedDistinctK2P40 = 2234 [["ID", "Shape", "Color", "Size", "Weight"], 2235 ["03", "square", "black", "L", "20"], 2236 ["05", "ellipse", "red", "S", "20"], 2237 ["08", "square", "black", "S", "10"], 2238 ["10", "square", "green", "L", "20"], 2239 ["13", "ellipse", "red", "S", "20"], 2240 ["15", "ellipse", "red", "L", "30"], 2241 ["16", "square", "red", "S", "10"], 2242 ["18", "square", "red", "S", "20"], 2243 ["19", "square", "black", "L", "20"], 2244 ["21", "ellipse", "black", "L", "30"], 2245 ["24", "square", "green", "L", "20"], 2246 ]; 2247 2248 string[][] data5x25ExpectedDistinctK2K4P20 = 2249 [["ID", "Shape", "Color", "Size", "Weight"], 2250 ["03", "square", "black", "L", "20"], 2251 ["07", "triangle", "red", "L", "20"], 2252 ["08", "square", "black", "S", "10"], 2253 ["10", "square", "green", "L", "20"], 2254 ["11", "triangle", "red", "L", "20"], 2255 ["16", "square", "red", "S", "10"], 2256 ["18", "square", "red", "S", "20"], 2257 ["19", "square", "black", "L", "20"], 2258 ["22", "triangle", "red", "L", "30"], 2259 ["24", "square", "green", "L", "20"], 2260 ]; 2261 2262 string[][] data5x25ExpectedDistinctK2K3K4P20 = 2263 [["ID", "Shape", "Color", "Size", "Weight"], 2264 ["04", "circle", "green", "L", "30"], 2265 ["07", "triangle", "red", "L", "20"], 2266 ["09", "circle", "black", "S", "20"], 2267 ["11", "triangle", "red", "L", "20"], 2268 ["12", "circle", "green", "L", "30"], 2269 ["14", "circle", "green", "L", "30"], 2270 ["16", "square", "red", "S", "10"], 2271 ["18", "square", "red", "S", "20"], 2272 
["22", "triangle", "red", "L", "30"], 2273 ]; 2274 2275 /* 2276 * Enough setup! Actually run some tests! 2277 */ 2278 2279 /* Permutations. Headers, static seed, compatibility mode. With weights and without. */ 2280 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 2281 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 2282 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 2283 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 2284 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 2285 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 2286 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2287 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2288 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2289 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2290 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2291 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2292 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 2293 2294 /* Permutations, without compatibility mode, or with both compatibility and printing. 
*/ 2295 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 2296 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 2297 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 2298 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 2299 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 2300 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 2301 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2302 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2303 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2304 2305 /* Reservoir sampling using Algorithm R. 2306 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
2307 */ 2308 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2309 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2310 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 2311 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 2312 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 2313 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 2314 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2315 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2316 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5); 2317 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4); 2318 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3); 2319 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2); 2320 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1); 2321 2322 /* Bernoulli sampling cases. 
*/ 2323 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 2324 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 2325 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 2326 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 2327 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 2328 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2329 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 2330 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 2331 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 2332 2333 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 2334 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 2335 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 2336 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 2337 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 2338 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 2339 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 2340 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 2341 2342 /* Distinct sampling cases. */ 2343 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 2344 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 2345 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 2346 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 2347 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 2348 2349 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 2350 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
2351 */ 2352 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2353 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2354 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2355 data3x6ExpectedWt3ProbsInorder); 2356 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2357 data3x6ExpectedWt3V41ProbsInorder); 2358 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 2359 data3x6ExpectedDistinctK1K3P60Probs); 2360 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 2361 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 2362 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 2363 data3x6ExpectedDistinctK2P2ProbsInorder); 2364 2365 /* Simple random sampling with replacement. */ 2366 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2367 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 2368 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 2369 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 2370 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 2371 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 2372 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 2373 2374 /* Permutations, compatibility mode, without headers. 
*/ 2375 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]); 2376 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]); 2377 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]); 2378 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]); 2379 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]); 2380 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2381 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2382 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2383 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]); 2384 2385 /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. 
*/ 2386 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]); 2387 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]); 2388 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]); 2389 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]); 2390 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2391 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2392 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2393 2394 /* Reservoir sampling using Algorithm R, no headers. */ 2395 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2396 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2397 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]); 2398 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]); 2399 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2400 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2401 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]); 2402 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]); 2403 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", 
fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]); 2404 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]); 2405 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]); 2406 2407 /* Bernoulli sampling cases. */ 2408 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]); 2409 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2410 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2411 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2412 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]); 2413 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]); 2414 2415 /* Bernoulli sampling with probabilities in skip sampling range. 
*/ 2416 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]); 2417 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]); 2418 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]); 2419 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]); 2420 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]); 2421 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]); 2422 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]); 2423 2424 /* Distinct sampling cases. */ 2425 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]); 2426 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2427 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2428 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2429 2430 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. 
*/ 2431 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2432 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]); 2433 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 2434 data3x6ExpectedDistinctK1K3P60Probs[1..$]); 2435 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 2436 data3x6ExpectedDistinctK2P2ProbsInorder[1..$]); 2437 2438 /* Simple random sampling with replacement. */ 2439 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2440 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 2441 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]); 2442 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]); 2443 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]); 2444 2445 /* Multi-file tests. 
*/ 2446 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 2447 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2448 combo1ExpectedPermuteCompat); 2449 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 2450 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2451 combo1ExpectedPermuteCompatProbs); 2452 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 2453 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2454 combo1ExpectedPermuteWt3Probs); 2455 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2456 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2457 combo1ExpectedPermuteWt3); 2458 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2459 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2460 combo1ExpectedPermuteAlgoRNum4); 2461 2462 /* Multi-file, no headers. 
*/ 2463 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 2464 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2465 fpath_data3x6_noheader, fpath_data3x2_noheader], 2466 combo1ExpectedPermuteCompat[1..$]); 2467 testTsvSample(["test-c7", "--static-seed", "--print-random", 2468 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2469 fpath_data3x6_noheader, fpath_data3x2_noheader], 2470 combo1ExpectedPermuteCompatProbs[1..$]); 2471 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 2472 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2473 fpath_data3x6_noheader, fpath_data3x2_noheader], 2474 combo1ExpectedPermuteWt3Probs[1..$]); 2475 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2476 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2477 fpath_data3x6_noheader, fpath_data3x2_noheader], 2478 combo1ExpectedPermuteWt3[1..$]); 2479 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2480 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2481 fpath_data3x6_noheader, fpath_data3x2_noheader], 2482 combo1ExpectedPermuteAlgoRNum4[1..$]); 2483 2484 /* Bernoulli sampling cases. 
*/ 2485 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 2486 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2487 combo1ExpectedBernoulliCompatP50Probs); 2488 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 2489 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2490 combo1ExpectedBernoulliCompatP40); 2491 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 2492 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2493 fpath_data3x6_noheader, fpath_data3x2_noheader], 2494 combo1ExpectedBernoulliCompatP50Probs[1..$]); 2495 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 2496 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2497 fpath_data3x6_noheader, fpath_data3x2_noheader], 2498 combo1ExpectedBernoulliCompatP40[1..$]); 2499 2500 /* Bernoulli sampling with probabilities in skip sampling range. */ 2501 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 2502 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 2503 combo2ExpectedBernoulliSkipV333P03); 2504 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 2505 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 2506 combo2ExpectedBernoulliSkipV333P03[1..$]); 2507 2508 /* Distinct sampling cases. */ 2509 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 2510 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2511 combo1ExpectedDistinctK1P40); 2512 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 2513 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2514 fpath_data3x6_noheader, fpath_data3x2_noheader], 2515 combo1ExpectedDistinctK1P40[1..$]); 2516 2517 /* Generating random weights. 
*/ 2518 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 2519 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2520 combo1ExpectedProbsInorder); 2521 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 2522 fpath_data3x3_noheader, fpath_data3x1_noheader, 2523 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 2524 combo1ExpectedProbsInorder[1..$]); 2525 2526 /* Simple random sampling with replacement. */ 2527 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 2528 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2529 combo1ExpectedReplaceNum10); 2530 2531 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 2532 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2533 fpath_data3x6_noheader, fpath_data3x2_noheader], 2534 combo1ExpectedReplaceNum10[1..$]); 2535 2536 /* Single column file. */ 2537 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2538 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2539 2540 /* Distributions. */ 2541 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 2542 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 2543 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 2544 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 2545 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 2546 2547 /* Tests of subset sample (--n|num) field. 
2548 * 2549 * Note: The way these tests are done ensures that subset length does not affect 2550 * output order. 2551 */ 2552 import std.algorithm : min; 2553 for (size_t n = data3x6.length + 2; n >= 1; n--) 2554 { 2555 /* reservoirSamplingViaHeap. 2556 */ 2557 size_t expectedLength = min(data3x6.length, n + 1); 2558 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 2559 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2560 2561 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 2562 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2563 2564 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 2565 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 2566 2567 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 2568 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 2569 2570 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 2571 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 2572 2573 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 2574 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 2575 2576 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 2577 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 2578 2579 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 2580 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 2581 2582 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 2583 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 2584 2585 /* Bernoulli sampling. 
2586 */ 2587 import std.algorithm : min; 2588 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 2589 2590 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2591 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 2592 2593 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2594 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 2595 2596 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2597 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 2598 2599 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2600 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 2601 2602 /* Distinct Sampling. 2603 */ 2604 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 2605 2606 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2607 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 2608 2609 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2610 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 2611 2612 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2613 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 2614 2615 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2616 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 2617 } 2618 2619 /* Similar tests with the 1x10 data set. 
*/ 2620 for (size_t n = data1x10.length + 2; n >= 1; n--) 2621 { 2622 size_t expectedLength = min(data1x10.length, n + 1); 2623 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 2624 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 2625 2626 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 2627 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 2628 2629 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 2630 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 2631 2632 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 2633 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 2634 } 2635 2636 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 2637 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 2638 { 2639 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 2640 data3x6ExpectedReplaceNum10[0 .. n + 1]); 2641 2642 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 2643 data3x6ExpectedReplaceNum10[1 .. n + 1]); 2644 } 2645 2646 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 2647 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 2648 { 2649 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 2650 2651 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2652 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 2653 2654 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2655 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 2656 } 2657 2658 2659 /* Distinct sampling tests. 
*/ 2660 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 2661 data5x25ExpectedDistinctK2P40); 2662 2663 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 2664 data5x25ExpectedDistinctK2K4P20); 2665 2666 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 2667 data5x25ExpectedDistinctK2K3K4P20); 2668 2669 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 2670 data5x25ExpectedDistinctK2P40[1..$]); 2671 2672 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 2673 data5x25ExpectedDistinctK2K4P20[1..$]); 2674 2675 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 2676 data5x25ExpectedDistinctK2K3K4P20[1..$]); 2677 }