/**
Command line tool for randomizing or sampling lines from input streams. Several
sampling methods are available, including simple random sampling, weighted random
sampling, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2018, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Program entry point.
     *
     * Command line arguments are processed by TsvSampleOptions.processArgs. If it
     * reports that processing should not continue, the exit code it supplies is
     * returned. Otherwise tsvSample is run, writing to a BufferedOutputRange wrapped
     * around stdout.
     *
     * Returns: 0 on success; 1 if an Exception is caught while sampling (the message
     * is written to stderr); or the exit code supplied by processArgs.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        auto r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        /* Reset LDC profile counters once argument processing is finished. */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/// Short help text, printed ahead of the option list for '--help'.
auto helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";

/// Detailed help text, printed ahead of the option list for '--help-verbose'.
auto helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set. For
these, memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";

/** Container for command line options.
182 */ 183 struct TsvSampleOptions 184 { 185 string programName; /// Program name 186 string[] files; /// Input files 187 bool helpVerbose = false; /// --help-verbose 188 bool hasHeader = false; /// --H|header 189 size_t sampleSize = 0; /// --n|num - Size of the desired sample 190 double inclusionProbability = double.nan; /// --p|prob - Inclusion probability 191 size_t[] keyFields; /// --k|key-fields - Used with inclusion probability 192 size_t weightField = 0; /// --w|weight-field - Field holding the weight 193 bool srsWithReplacement = false; /// --r|replace 194 bool staticSeed = false; /// --s|static-seed 195 uint seedValueOptionArg = 0; /// --v|seed-value 196 bool printRandom = false; /// --print-random 197 bool genRandomInorder = false; /// --gen-random-inorder 198 string randomValueHeader = "random_value"; /// --random-value-header 199 bool compatibilityMode = false; /// --compatibility-mode 200 char delim = '\t'; /// --d|delimiter 201 bool versionWanted = false; /// --V|version 202 bool preferSkipSampling = false; /// --prefer-skip-sampling 203 bool preferAlgorithmR = false; /// --prefer-algorithm-r 204 bool hasWeightField = false; /// Derived. 205 bool useBernoulliSampling = false; /// Derived. 206 bool useDistinctSampling = false; /// Derived. 207 bool distinctKeyIsFullLine = false; /// Derived. True if '--k|key-fields 0' is specfied. 208 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 209 uint seed = 0; /// Derived from --static-seed, --seed-value 210 211 auto processArgs(ref string[] cmdArgs) 212 { 213 import std.algorithm : any, canFind, each; 214 import std.getopt; 215 import std.math : isNaN; 216 import std.path : baseName, stripExtension; 217 import std.typecons : Yes, No; 218 import tsv_utils.common.utils : makeFieldListOptionHandler; 219 220 programName = (cmdArgs.length > 0) ? 
cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 221 222 try 223 { 224 arraySep = ","; // Use comma to separate values in command line options 225 auto r = getopt( 226 cmdArgs, 227 "help-verbose", " Print more detailed help.", &helpVerbose, 228 229 std.getopt.config.caseSensitive, 230 "H|header", " Treat the first line of each file as a header.", &hasHeader, 231 std.getopt.config.caseInsensitive, 232 233 "n|num", "NUM Maximim number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, 234 "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, 235 236 "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.", 237 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 238 239 "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField, 240 "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, 241 "s|static-seed", " Use the same random seed every run.", &staticSeed, 242 243 std.getopt.config.caseSensitive, 244 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 245 std.getopt.config.caseInsensitive, 246 247 "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, 248 "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, 249 "random-value-header", " Header to use with --print-random and --gen-random-inorder. 
Default: 'random_value'.", &randomValueHeader, 250 "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode, 251 252 "d|delimiter", "CHR Field delimiter.", &delim, 253 254 std.getopt.config.caseSensitive, 255 "V|version", " Print version information and exit.", &versionWanted, 256 std.getopt.config.caseInsensitive, 257 258 "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", 259 &preferSkipSampling, 260 261 "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.", 262 &preferAlgorithmR, 263 ); 264 265 if (r.helpWanted) 266 { 267 defaultGetoptPrinter(helpText, r.options); 268 return tuple(false, 0); 269 } 270 else if (helpVerbose) 271 { 272 defaultGetoptPrinter(helpTextVerbose, r.options); 273 return tuple(false, 0); 274 } 275 else if (versionWanted) 276 { 277 import tsv_utils.common.tsvutils_version; 278 writeln(tsvutilsVersionNotice("tsv-sample")); 279 return tuple(false, 0); 280 } 281 282 /* Derivations and validations. */ 283 if (weightField > 0) 284 { 285 hasWeightField = true; 286 weightField--; // Switch to zero-based indexes. 
287 } 288 289 if (srsWithReplacement) 290 { 291 if (hasWeightField) 292 { 293 throw new Exception("Sampling with replacement (--r|replace) does not support wieghts (--w|weight-field)."); 294 } 295 else if (!inclusionProbability.isNaN) 296 { 297 throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); 298 } 299 else if (keyFields.length > 0) 300 { 301 throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); 302 } 303 else if (printRandom || genRandomInorder) 304 { 305 throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); 306 } 307 } 308 309 if (keyFields.length > 0) 310 { 311 /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */ 312 313 if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields."); 314 315 if (keyFields.length == 1 && keyFields[0] == 0) 316 { 317 distinctKeyIsFullLine = true; 318 } 319 else 320 { 321 if (keyFields.length > 1 && keyFields.any!(x => x == 0)) 322 { 323 throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 324 } 325 326 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 327 } 328 } 329 330 /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */ 331 if (!inclusionProbability.isNaN) 332 { 333 if (inclusionProbability <= 0.0 || inclusionProbability > 1.0) 334 { 335 import std.format : format; 336 throw new Exception( 337 format("Invalid --p|prob option: %g. 
Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); 338 } 339 340 if (keyFields.length > 0) useDistinctSampling = true; 341 else useBernoulliSampling = true; 342 343 if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together."); 344 345 if (genRandomInorder && !useDistinctSampling) 346 { 347 throw new Exception("--q|gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used."); 348 } 349 } 350 else if (genRandomInorder && !hasWeightField) 351 { 352 useBernoulliSampling = true; 353 } 354 355 if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') || 356 randomValueHeader.canFind(delim)) 357 { 358 throw new Exception("--randomValueHeader must be at least one character and not contain field delimiters or newlines."); 359 } 360 361 /* Random value printing implies compatibility-mode, otherwise user's selection is used. */ 362 if (printRandom || genRandomInorder) compatibilityMode = true; 363 364 /* Seed. */ 365 import std.random : unpredictableSeed; 366 367 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 368 369 if (usingUnpredictableSeed) seed = unpredictableSeed; 370 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 371 else if (staticSeed) seed = 2438424139; 372 else assert(0, "Internal error, invalid seed option states."); 373 374 /* Assume remaining args are files. Use standard input if files were not provided. */ 375 files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"]; 376 cmdArgs.length = 1; 377 } 378 catch (Exception exc) 379 { 380 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 381 return tuple(false, 1); 382 } 383 return tuple(true, 0); 384 } 385 } 386 /** Invokes the appropriate sampling routine based on the command line arguments. 
387 */ 388 void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 389 if (isOutputRange!(OutputRange, char)) 390 { 391 if (cmdopt.srsWithReplacement) 392 { 393 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 394 } 395 else if (cmdopt.useBernoulliSampling) 396 { 397 bernoulliSamplingCommand(cmdopt, outputStream); 398 } 399 else if (cmdopt.useDistinctSampling) 400 { 401 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 402 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 403 } 404 else if (cmdopt.genRandomInorder) 405 { 406 /* Note that the preceeding cases handle gen-random-inorder themselves (Bernoulli, 407 * Distinct), or don't handle it (SRS w/ Replacement). 408 */ 409 assert(cmdopt.hasWeightField); 410 generateWeightedRandomValuesInorder(cmdopt, outputStream); 411 } 412 else if (cmdopt.sampleSize != 0) 413 { 414 reservoirSamplingCommand(cmdopt, outputStream); 415 } 416 else 417 { 418 randomizeLinesCommand(cmdopt, outputStream); 419 } 420 } 421 422 /** Invokes the appropriate Bernoulli sampling routine based on the command line arguments. 423 * 424 * This routine selects the appropriate bernoulli sampling function and template 425 * instantiation to use based on the command line arguments. 426 * 427 * See the bernoulliSkipSampling routine for a discussion of the choices behind the 428 * skipSamplingProbabilityThreshold used here. 
429 */ 430 void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 431 if (isOutputRange!(OutputRange, char)) 432 { 433 assert(!cmdopt.hasWeightField); 434 435 immutable double skipSamplingProbabilityThreshold = 0.04; 436 437 if (cmdopt.compatibilityMode || 438 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 439 { 440 if (cmdopt.genRandomInorder) 441 { 442 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 443 } 444 else 445 { 446 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 447 } 448 } 449 else 450 { 451 bernoulliSkipSampling(cmdopt, outputStream); 452 } 453 } 454 455 /** Bernoulli sampling of lines on the input stream. 456 * 457 * Each input line is a assigned a random value and output if less than 458 * cmdopt.inclusionProbability. The order of the lines is not changed. 459 * 460 * This routine supports random value printing and gen-random-inorder value printing. 461 */ 462 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 463 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 464 if (isOutputRange!(OutputRange, char)) 465 { 466 import std.random : Random = Mt19937, uniform01; 467 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 468 469 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 470 else assert(!cmdopt.genRandomInorder); 471 472 auto randomGenerator = Random(cmdopt.seed); 473 474 /* Process each line. */ 475 bool headerWritten = false; 476 size_t numLinesWritten = 0; 477 foreach (filename; cmdopt.files) 478 { 479 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 480 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 481 { 482 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 483 if (fileLineNum == 1 && cmdopt.hasHeader) 484 { 485 if (!headerWritten) 486 { 487 static if (generateRandomAll) 488 { 489 outputStream.put(cmdopt.randomValueHeader); 490 outputStream.put(cmdopt.delim); 491 } 492 else if (cmdopt.printRandom) 493 { 494 outputStream.put(cmdopt.randomValueHeader); 495 outputStream.put(cmdopt.delim); 496 } 497 498 outputStream.put(line); 499 outputStream.put("\n"); 500 headerWritten = true; 501 } 502 } 503 else 504 { 505 double lineScore = uniform01(randomGenerator); 506 507 static if (generateRandomAll) 508 { 509 outputStream.formatRandomValue(lineScore); 510 outputStream.put(cmdopt.delim); 511 outputStream.put(line); 512 outputStream.put("\n"); 513 514 if (cmdopt.sampleSize != 0) 515 { 516 ++numLinesWritten; 517 if (numLinesWritten == cmdopt.sampleSize) return; 518 } 519 } 520 else if (lineScore < cmdopt.inclusionProbability) 521 { 522 if (cmdopt.printRandom) 523 { 524 outputStream.formatRandomValue(lineScore); 525 outputStream.put(cmdopt.delim); 526 } 527 outputStream.put(line); 528 outputStream.put("\n"); 529 530 if (cmdopt.sampleSize != 0) 531 { 532 ++numLinesWritten; 533 if (numLinesWritten == cmdopt.sampleSize) return; 534 } 535 } 536 } 537 } 538 } 539 } 540 541 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips. 542 * 543 * Skip sampling works by skipping a random number of lines between selections. This 544 * can be faster than assigning a random value to each line when the inclusion 545 * probability is low, as it reduces the number of calls to the random number 546 * generator. Both the random number generator and the log() function as called when 547 * calculating the next skip size. These additional log() calls add up as the 548 * probability increases. 
549 * 550 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 551 * file-oriented line sampling. This is obviously environment specific. In the 552 * environments this implementation has been tested in the perfmance improvements 553 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 554 * 555 * The algorithm does not assign random values to individual lines. This makes it 556 * incompatible with random value printing. It is not suitable for compatibility mode 557 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 558 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 559 * does not have this property. 560 * 561 * The algorithm for calculating the skip size has been described by multiple sources. 562 * There are two key variants depending on whether the total number of lines in the 563 * data set is known in advance. (This implementation does not know the total.) 564 * Useful references: 565 * $(LIST 566 * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 567 * ACM Trans on Mathematical Software, 1987. On-line: 568 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 569 * * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book 570 * "Data Stream Management", Springer-Verlag, 2016. On-line: 571 * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf 572 * * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. 
On-line: 573 * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ 574 * ) 575 */ 576 void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream) 577 if (isOutputRange!(OutputRange, char)) 578 { 579 import std.conv : to; 580 import std.math : log, trunc; 581 import std.random : Random = Mt19937, uniform01; 582 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 583 584 assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 585 assert(!cmdopt.printRandom); 586 assert(!cmdopt.compatibilityMode); 587 588 auto randomGenerator = Random(cmdopt.seed); 589 590 immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 591 immutable double logDiscardRate = log(discardRate); 592 593 /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed 594 * interval to (0.0, 1.0], excluding 0.0. 595 */ 596 size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 597 598 /* Process each line. */ 599 bool headerWritten = false; 600 size_t numLinesWritten = 0; 601 foreach (filename; cmdopt.files) 602 { 603 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 604 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 605 { 606 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 607 if (fileLineNum == 1 && cmdopt.hasHeader) 608 { 609 if (!headerWritten) 610 { 611 outputStream.put(line); 612 outputStream.put("\n"); 613 headerWritten = true; 614 } 615 } 616 else if (remainingSkips > 0) 617 { 618 --remainingSkips; 619 } 620 else 621 { 622 outputStream.put(line); 623 outputStream.put("\n"); 624 625 if (cmdopt.sampleSize != 0) 626 { 627 ++numLinesWritten; 628 if (numLinesWritten == cmdopt.sampleSize) return; 629 } 630 631 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 632 } 633 } 634 } 635 } 636 637 /** Sample a subset of the unique values from the key fields. 638 * 639 * Distinct sampling is done by hashing the key and mapping the hash value into 640 * buckets matching the inclusion probability. Records having a key mapping to bucket 641 * zero are output. 642 */ 643 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 644 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 645 if (isOutputRange!(OutputRange, char)) 646 { 647 import std.algorithm : splitter; 648 import std.conv : to; 649 import std.digest.murmurhash; 650 import std.math : lrint; 651 import tsv_utils.common.utils : InputFieldReordering, throwIfWindowsNewlineOnUnix; 652 653 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 654 else assert(!cmdopt.genRandomInorder); 655 656 assert(cmdopt.keyFields.length > 0); 657 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 658 659 static if (generateRandomAll) 660 { 661 import std.format : formatValue, singleSpec; 662 immutable randomValueFormatSpec = singleSpec("%d"); 663 } 664 665 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 
666 667 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 668 669 /* Create a mapping for the key fields. */ 670 auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 671 672 /* Process each line. */ 673 bool headerWritten = false; 674 size_t numLinesWritten = 0; 675 foreach (filename; cmdopt.files) 676 { 677 auto inputStream = (filename == "-") ? stdin : filename.File(); 678 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 679 { 680 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 681 if (fileLineNum == 1 && cmdopt.hasHeader) 682 { 683 if (!headerWritten) 684 { 685 static if (generateRandomAll) 686 { 687 outputStream.put(cmdopt.randomValueHeader); 688 outputStream.put(cmdopt.delim); 689 } 690 else if (cmdopt.printRandom) 691 { 692 outputStream.put(cmdopt.randomValueHeader); 693 outputStream.put(cmdopt.delim); 694 } 695 696 outputStream.put(line); 697 outputStream.put("\n"); 698 headerWritten = true; 699 } 700 } 701 else 702 { 703 /* Murmurhash works by successively adding individual keys, then finalizing. 704 * Adding individual keys is simpler if the full-line-as-key and individual 705 * fields as keys cases are separated. 706 */ 707 auto hasher = MurmurHash3!32(cmdopt.seed); 708 709 if (cmdopt.distinctKeyIsFullLine) 710 { 711 hasher.put(cast(ubyte[]) line); 712 } 713 else 714 { 715 assert(keyFieldsReordering !is null); 716 717 /* Gather the key field values and assemble the key. */ 718 keyFieldsReordering.initNewLine; 719 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 720 { 721 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 722 if (keyFieldsReordering.allFieldsFilled) break; 723 } 724 725 if (!keyFieldsReordering.allFieldsFilled) 726 { 727 import std.format : format; 728 throw new Exception( 729 format("Not enough fields in line. File: %s, Line: %s", 730 (filename == "-") ? 
"Standard Input" : filename, fileLineNum)); 731 } 732 733 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 734 { 735 if (count > 0) hasher.put(delimArray); 736 hasher.put(cast(ubyte[]) key); 737 } 738 } 739 740 hasher.finish; 741 742 static if (generateRandomAll) 743 { 744 import std.conv : to; 745 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 746 outputStream.put(cmdopt.delim); 747 outputStream.put(line); 748 outputStream.put("\n"); 749 750 if (cmdopt.sampleSize != 0) 751 { 752 ++numLinesWritten; 753 if (numLinesWritten == cmdopt.sampleSize) return; 754 } 755 } 756 else if (hasher.get % numBuckets == 0) 757 { 758 if (cmdopt.printRandom) 759 { 760 outputStream.put('0'); 761 outputStream.put(cmdopt.delim); 762 } 763 outputStream.put(line); 764 outputStream.put("\n"); 765 766 if (cmdopt.sampleSize != 0) 767 { 768 ++numLinesWritten; 769 if (numLinesWritten == cmdopt.sampleSize) return; 770 } 771 } 772 } 773 } 774 } 775 } 776 777 /** Invokes the appropriate reservoir sampling routine based on the command line 778 * arguments. 779 * 780 * This routine selects the appropriate reservior sampling function and template 781 * instantiation to use based on the command line arguments. 782 * 783 * Reservoir sampling is used when a fixed size sample is being pulled from an input 784 * stream. Weighted and unweighted sampling is supported. These routines also 785 * randomize the order of the selected lines. This is consistent with line order 786 * randomization of the entire input stream (handled by randomizeLinesCommand). 787 * 788 * For unweighted sampling, there is a performance tradeoff choice between the two 789 * available implementations. See the reservoirSampling documentation for 790 * information. The threshold used here was chosen based on performance tests. 
791 */ 792 793 void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 794 if (isOutputRange!(OutputRange, char)) 795 { 796 assert(cmdopt.sampleSize != 0); 797 798 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 799 800 if (cmdopt.hasWeightField) 801 { 802 reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream); 803 } 804 else if (cmdopt.compatibilityMode || 805 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 806 { 807 reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream); 808 } 809 else 810 { 811 reservoirSamplingAlgorithmR(cmdopt, outputStream); 812 } 813 } 814 815 /** Reservior sampling using a heap. Both weighted and unweighted random sampling are 816 * supported. 817 * 818 * The algorithm used here is based on the one-pass algorithm described by Pavlos 819 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 820 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are 821 * simply set to one. 822 * 823 * The implementation uses a heap (priority queue) large enough to hold the desired 824 * number of lines. Input is read line-by-line, assigned a random value, and added to 825 * the heap. The role of the identify the lines with the highest assigned random 826 * values. Once the heap is full, adding a new line means dropping the line with the 827 * lowest score. A "min" heap used for this reason. 828 * 829 * When done reading all lines, the "min" heap is in the opposite order needed for 830 * output. The desired order is obtained by removing each element one at at time from 831 * the heap. The underlying data store will have the elements in correct order. 832 * 833 * Generating output in weighted order matters for several reasons: 834 * - For weighted sampling, it preserves the property that smaller valid subsets can be 835 * created by taking the first N lines. 
836 * - For unweighted sampling, it ensures that all output permutations are possible, and 837 * are not influences by input order or the heap data structure used. 838 * - Order consistency when making repeated use of the same random seeds, but with 839 * different sample sizes. 840 * 841 * There are use cases where only the selection set matters, for these some performance 842 * could be gained by skipping the reordering and simply printing the backing store 843 * array in-order, but making this distinction seems an unnecessary complication. 844 * 845 * Notes: 846 * $(LIST 847 * * In tsv-sample versions 1.2.1 and earlier this routine also supported 848 * randomization of all input lines. This was dropped in version 1.2.2 in favor 849 * of the approach used in randomizeLines. The latter has significant advantages 850 * given that all data data must be read into memory. 851 * * For larger reservoir sizes better performance can be achieved by using 852 * reservoirSamplingAlgorithmR. See the documentation of that function for details. 853 * ) 854 */ 855 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange) 856 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 857 if (isOutputRange!(OutputRange, char)) 858 { 859 import std.container.array; 860 import std.container.binaryheap; 861 import std.random : Random = Mt19937, uniform01; 862 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 863 864 static if (isWeighted) assert(cmdopt.hasWeightField); 865 else assert(!cmdopt.hasWeightField); 866 867 assert(cmdopt.sampleSize > 0); 868 869 auto randomGenerator = Random(cmdopt.seed); 870 871 struct Entry 872 { 873 double score; 874 char[] line; 875 } 876 877 /* Create the heap and backing data store. 878 * 879 * Note: An std.container.array is used as the backing store to avoid some issues in 880 * the standard library (Phobos) binaryheap implementation. 
Specifically, when an 881 * std.container.array is used as backing store, the heap can efficiently reversed by 882 * removing the heap elements. This leaves the backing store in the reversed order. 883 * However, the current binaryheap implementation does not support this for all 884 * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. 885 */ 886 887 Array!Entry dataStore; 888 dataStore.reserve(cmdopt.sampleSize); 889 auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap 890 891 /* Process each line. */ 892 bool headerWritten = false; 893 foreach (filename; cmdopt.files) 894 { 895 auto inputStream = (filename == "-") ? stdin : filename.File(); 896 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 897 { 898 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 899 if (fileLineNum == 1 && cmdopt.hasHeader) 900 { 901 if (!headerWritten) 902 { 903 if (cmdopt.printRandom) 904 { 905 outputStream.put(cmdopt.randomValueHeader); 906 outputStream.put(cmdopt.delim); 907 } 908 outputStream.put(line); 909 outputStream.put("\n"); 910 headerWritten = true; 911 } 912 } 913 else 914 { 915 static if (!isWeighted) 916 { 917 double lineScore = uniform01(randomGenerator); 918 } 919 else 920 { 921 double lineWeight = 922 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum); 923 double lineScore = 924 (lineWeight > 0.0) 925 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 926 : 0.0; 927 } 928 929 if (reservoir.length < cmdopt.sampleSize) 930 { 931 reservoir.insert(Entry(lineScore, line.dup)); 932 } 933 else if (reservoir.front.score < lineScore) 934 { 935 reservoir.replaceFront(Entry(lineScore, line.dup)); 936 } 937 } 938 } 939 } 940 941 /* All entries are in the reservoir. Time to print. The heap is in reverse order 942 * of assigned weights. 
Reversing order is done by removing all elements from the 943 * heap, this leaves the backing store in the correct order for output. 944 * 945 * The asserts here avoid issues with the current binaryheap implementation. They 946 * detect use of backing stores having a length not synchronized to the reservoir. 947 */ 948 size_t numLines = reservoir.length; 949 assert(numLines == dataStore.length); 950 951 while (!reservoir.empty) reservoir.removeFront; 952 assert(numLines == dataStore.length); 953 954 foreach (entry; dataStore) 955 { 956 if (cmdopt.printRandom) 957 { 958 outputStream.formatRandomValue(entry.score); 959 outputStream.put(cmdopt.delim); 960 } 961 outputStream.put(entry.line); 962 outputStream.put("\n"); 963 } 964 } 965 966 /** Generates weighted random values for all input lines, preserving input order. 967 * 968 * This complements weighted reservoir sampling, but instead of using a reservoir it 969 * simply iterates over the input lines generating the values. The weighted random 970 * values are generated with the same formula used by reservoirSampling. 971 */ 972 void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 973 if (isOutputRange!(OutputRange, char)) 974 { 975 import std.random : Random = Mt19937, uniform01; 976 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 977 978 assert(cmdopt.hasWeightField); 979 980 auto randomGenerator = Random(cmdopt.seed); 981 982 /* Process each line. */ 983 bool headerWritten = false; 984 size_t numLinesWritten = 0; 985 foreach (filename; cmdopt.files) 986 { 987 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 988 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 989 { 990 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 991 if (fileLineNum == 1 && cmdopt.hasHeader) 992 { 993 if (!headerWritten) 994 { 995 outputStream.put(cmdopt.randomValueHeader); 996 outputStream.put(cmdopt.delim); 997 outputStream.put(line); 998 outputStream.put("\n"); 999 headerWritten = true; 1000 } 1001 } 1002 else 1003 { 1004 double lineWeight = getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 1005 filename, fileLineNum); 1006 double lineScore = 1007 (lineWeight > 0.0) 1008 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1009 : 0.0; 1010 1011 outputStream.formatRandomValue(lineScore); 1012 outputStream.put(cmdopt.delim); 1013 outputStream.put(line); 1014 outputStream.put("\n"); 1015 1016 if (cmdopt.sampleSize != 0) 1017 { 1018 ++numLinesWritten; 1019 if (numLinesWritten == cmdopt.sampleSize) return; 1020 } 1021 } 1022 } 1023 } 1024 } 1025 1026 /** Reservoir sampling via Algorithm R 1027 * 1028 * This is an implementation of reservoir sampling using what is commonly known as 1029 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of 1030 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about 1031 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with 1032 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" 1033 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). 1034 * 1035 * Algorithm R is used for unweighted sampling without replacement. The heap-based 1036 * algorithm in reservoirSamplingViaHeap is used for weighted sampling. 1037 * 1038 * The classic algorithm stops after identifying the selected set of items. This 1039 * implementation goes one step further and randomizes the order of the selected 1040 * lines. 
This supports the tsv-sample use-case, which is line order randomization. 1041 * 1042 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1043 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1044 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1045 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1046 * 1047 * This speed advantage may be offset a certain amount by using a more expensive random 1048 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1049 * whereas reservoirSamplingAlgorithR generates random integers over and ever growing 1050 * interval. The latter is expected to be more expensive. This is consistent with 1051 * performance test indicating that reservoirSamplingViaHeap is faster when using 1052 * small-to-medium size reservoirs and large input streams. 1053 */ 1054 void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1055 if (isOutputRange!(OutputRange, char)) 1056 { 1057 import std.random : Random = Mt19937, randomShuffle, uniform; 1058 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 1059 1060 assert(cmdopt.sampleSize > 0); 1061 assert(!cmdopt.hasWeightField); 1062 assert(!cmdopt.compatibilityMode); 1063 assert(!cmdopt.printRandom); 1064 assert(!cmdopt.genRandomInorder); 1065 1066 string[] reservoir; 1067 auto reservoirAppender = appender(&reservoir); 1068 reservoirAppender.reserve(cmdopt.sampleSize); 1069 1070 auto randomGenerator = Random(cmdopt.seed); 1071 1072 /* Process each line. */ 1073 1074 bool headerWritten = false; 1075 size_t totalLineNum = 0; 1076 foreach (filename; cmdopt.files) 1077 { 1078 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 1079 foreach (fileLineNum, line; inputStream.byLine(KeepTerminator.no).enumerate(1)) 1080 { 1081 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 1082 if (fileLineNum == 1 && cmdopt.hasHeader) 1083 { 1084 if (!headerWritten) 1085 { 1086 outputStream.put(line); 1087 outputStream.put("\n"); 1088 headerWritten = true; 1089 } 1090 } 1091 else 1092 { 1093 /* Add lines to the reservoir until the reservoir is filled. 1094 * After that lines are added with decreasing likelihood, based on 1095 * the total number of lines seen. If added to the reservoir, the 1096 * line replaces a randomly chosen existing line. 1097 */ 1098 if (totalLineNum < cmdopt.sampleSize) 1099 { 1100 reservoirAppender ~= line.idup; 1101 } 1102 else 1103 { 1104 size_t i = uniform(0, totalLineNum, randomGenerator); 1105 if (i < reservoir.length) reservoir[i] = line.idup; 1106 } 1107 1108 ++totalLineNum; 1109 } 1110 } 1111 } 1112 1113 /* The random sample is now in the reservior. Shuffle it and print. */ 1114 1115 reservoir.randomShuffle(randomGenerator); 1116 1117 foreach (ref line; reservoir) 1118 { 1119 outputStream.put(line); 1120 outputStream.put("\n"); 1121 } 1122 } 1123 1124 /** Invokes the appropriate routine to randomize input lines based on the command line 1125 * arguments. 1126 * 1127 * This routine selects the appropriate randomize lines function and template instantiation 1128 * to use based on the command line arguments. 
1129 */ 1130 void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1131 if (isOutputRange!(OutputRange, char)) 1132 { 1133 if (cmdopt.hasWeightField) 1134 { 1135 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1136 } 1137 else if (cmdopt.compatibilityMode) 1138 { 1139 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1140 } 1141 else 1142 { 1143 randomizeLinesViaShuffle(cmdopt, outputStream); 1144 } 1145 } 1146 1147 /** Randomize all the lines in files or standard input using assigned random weights 1148 * and sorting. 1149 * 1150 * All lines in files and/or standard input are read in and written out in random 1151 * order. This algorithm assigns a random value to each line and sorts. This approach 1152 * supports both weighted sampling and simple random sampling (unweighted). 1153 * 1154 * This is significantly faster than heap-based reservoir sampling in the case where 1155 * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted 1156 * case, as it is a little faster, at the cost not supporting random value printing or 1157 * compatibility-mode. 1158 * 1159 * Input data size is limited by available memory. Disk oriented techniques are needed 1160 * when data sizes are larger. For example, generating random values line-by-line (ala 1161 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1162 */ 1163 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1164 if (isOutputRange!(OutputRange, char)) 1165 { 1166 import std.algorithm : map, sort; 1167 1168 static if (isWeighted) assert(cmdopt.hasWeightField); 1169 else assert(!cmdopt.hasWeightField); 1170 1171 assert(cmdopt.sampleSize == 0); 1172 1173 /* 1174 * Read all file data into memory. Then split the data into lines and assign a 1175 * random value to each line. 
identifyFileLines also writes the first header line. 1176 */ 1177 auto fileData = cmdopt.files.map!FileData.array; 1178 auto inputLines = fileData.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream); 1179 1180 /* 1181 * Sort by the weight and output the lines. 1182 */ 1183 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1184 1185 foreach (lineEntry; inputLines) 1186 { 1187 if (cmdopt.printRandom) 1188 { 1189 outputStream.formatRandomValue(lineEntry.randomValue); 1190 outputStream.put(cmdopt.delim); 1191 } 1192 outputStream.put(lineEntry.data); 1193 outputStream.put("\n"); 1194 } 1195 } 1196 1197 /** Randomize all the lines in files or standard input using a shuffling algorithm. 1198 * 1199 * All lines in files and/or standard input are read in and written out in random 1200 * order. This routine uses array shuffling, which is faster than sorting. This makes 1201 * this routine a good alternative to randomizeLinesViaSort when doing unweighted 1202 * randomization. 1203 * 1204 * Input data size is limited by available memory. Disk oriented techniques are needed 1205 * when data sizes are larger. For example, generating random values line-by-line (ala 1206 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1207 * 1208 * This routine does not support random value printing or compatibility-mode. 1209 */ 1210 void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1211 if (isOutputRange!(OutputRange, char)) 1212 { 1213 import std.algorithm : map; 1214 import std.random : Random = Mt19937, randomShuffle; 1215 1216 assert(cmdopt.sampleSize == 0); 1217 assert(!cmdopt.hasWeightField); 1218 assert(!cmdopt.printRandom); 1219 assert(!cmdopt.genRandomInorder); 1220 1221 /* 1222 * Read all file data into memory and split into lines. 
1223 */ 1224 auto fileData = cmdopt.files.map!FileData.array; 1225 auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1226 1227 /* 1228 * Randomly shuffle and print each line. 1229 * 1230 * Note: Also tried randomCover, but that was exceedingly slow. 1231 */ 1232 import std.random : randomShuffle; 1233 1234 auto randomGenerator = Random(cmdopt.seed); 1235 inputLines.randomShuffle(randomGenerator); 1236 1237 foreach (ref line; inputLines) 1238 { 1239 outputStream.put(line.data); 1240 outputStream.put("\n"); 1241 } 1242 } 1243 1244 /** Simple random sampling with replacement. 1245 * 1246 * All lines in files and/or standard input are read in. Then random lines are selected 1247 * one at a time and output. Lines can be selected multiple times. This process continues 1248 * until the desired number of samples (--n|num) has been output. Output continues 1249 * indefinitely if a sample size was not provided. 1250 */ 1251 void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1252 if (isOutputRange!(OutputRange, char)) 1253 { 1254 import std.algorithm : map; 1255 import std.random : Random = Mt19937, uniform; 1256 1257 /* 1258 * Read all file data into memory and split the data into lines. 1259 */ 1260 auto fileData = cmdopt.files.map!FileData.array; 1261 auto inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1262 1263 if (inputLines.length > 0) 1264 { 1265 auto randomGenerator = Random(cmdopt.seed); 1266 1267 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1268 size_t numLeft = (cmdopt.sampleSize == 0) ? 
1 : cmdopt.sampleSize; 1269 while (numLeft != 0) 1270 { 1271 size_t index = uniform(0, inputLines.length, randomGenerator); 1272 outputStream.put(inputLines[index].data); 1273 outputStream.put("\n"); 1274 if (cmdopt.sampleSize != 0) numLeft--; 1275 } 1276 } 1277 } 1278 1279 /** A container and reader data form a file or standard input. 1280 * 1281 * The FileData struct is used to read data from a file or standard input. It is used 1282 * by passing a filename to the constructor. The constructor reads the file data. 1283 * If the filename is a single hyphen ('-') then data is read from standard input. 1284 * 1285 * The struct make the data available through two members: 'filename', which is the 1286 * filename, and 'data', which is a character array of the data. 1287 */ 1288 struct FileData 1289 { 1290 string filename; 1291 char[] data; 1292 1293 this(string fname) 1294 { 1295 import std.algorithm : min; 1296 import std.array : appender; 1297 1298 filename = fname; 1299 1300 ubyte[1024 * 128] fileRawBuf; 1301 auto dataAppender = appender(&data); 1302 auto ifile = (filename == "-") ? stdin : filename.File; 1303 1304 if (filename != "-") 1305 { 1306 ulong filesize = ifile.size; 1307 if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max)); 1308 } 1309 1310 foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer); 1311 } 1312 } 1313 1314 /** HasRandomValue is a boolean flag used at compile time by identifyFileLines to 1315 * distinguish use cases needing random value assignments from those that don't. 1316 */ 1317 alias HasRandomValue = Flag!"hasRandomValue"; 1318 1319 /** An InputLine array is returned by identifyFileLines to represent each non-header line 1320 * line found in a FileData array. The 'data' element contains the line. A 'randomValue' 1321 * line is included if random values are being generated. 
1322 */ 1323 struct InputLine(HasRandomValue hasRandomValue) 1324 { 1325 char[] data; 1326 static if (hasRandomValue) double randomValue; 1327 } 1328 1329 /** identifyFileLines is used by algorithms that read all files into memory prior to 1330 * processing. It does the initial processing of the file data. 1331 * 1332 * Three primary tasks are performed. One is splitting all input data into lines. The 1333 * second is writting the header line from the first file to the output stream. Header 1334 * lines from subsequent files are ignored. Third is assigning a random value to the 1335 * line, if random values are being generated. 1336 * 1337 * The key input is a FileData array, one element for each file. The FileData reads 1338 * the file when instantiated. 1339 * 1340 * The return value is an array of InputLine structs. The struct will have a 'randomValue' 1341 * member if random values are being assigned. 1342 */ 1343 InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange) 1344 (ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1345 if (isOutputRange!(OutputRange, char)) 1346 { 1347 import std.algorithm : splitter; 1348 import std.array : appender; 1349 import std.random : Random = Mt19937, uniform01; 1350 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 1351 1352 static assert(hasRandomValue || !isWeighted); 1353 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1354 1355 InputLine!hasRandomValue[] inputLines; 1356 1357 auto linesAppender = appender(&inputLines); 1358 static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed); 1359 bool headerWritten = false; 1360 1361 foreach (fd; fileData) 1362 { 1363 /* Drop the last newline to avoid adding an extra empty line. */ 1364 auto data = (fd.data.length > 0 && fd.data[$ - 1] == '\n') ? fd.data[0 .. 
$ - 1] : fd.data; 1365 foreach (fileLineNum, ref line; data.splitter('\n').enumerate(1)) 1366 { 1367 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum); 1368 if (fileLineNum == 1 && cmdopt.hasHeader) 1369 { 1370 if (!headerWritten) 1371 { 1372 if (cmdopt.printRandom) 1373 { 1374 outputStream.put(cmdopt.randomValueHeader); 1375 outputStream.put(cmdopt.delim); 1376 } 1377 outputStream.put(line); 1378 outputStream.put("\n"); 1379 headerWritten = true; 1380 } 1381 } 1382 else 1383 { 1384 static if (!hasRandomValue) 1385 { 1386 linesAppender.put(InputLine!hasRandomValue(line)); 1387 } 1388 else 1389 { 1390 static if (!isWeighted) 1391 { 1392 double randomValue = uniform01(randomGenerator); 1393 } 1394 else 1395 { 1396 double lineWeight = 1397 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 1398 fd.filename, fileLineNum); 1399 double randomValue = 1400 (lineWeight > 0.0) 1401 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1402 : 0.0; 1403 } 1404 1405 linesAppender.put(InputLine!hasRandomValue(line, randomValue)); 1406 } 1407 } 1408 } 1409 } 1410 1411 return inputLines; 1412 } 1413 1414 /** Write a floating point random value to an output stream. 1415 * 1416 * This routine is used for floating point random value printing. This routine writes 1417 * 17 significant digits, the range available in doubles. This routine prefers decimal 1418 * format, without exponents. It will generate somewhat large precision numbers, 1419 * currently up to 28 digits, before switching to exponents. 1420 * 1421 * The primary reason for this approach is to enable faster sorting on random values 1422 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster 1423 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch). 1424 * The 'general numeric' handles exponential notation. The difference is 5-10x. 
1425 * 1426 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12. 1427 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar 1428 * results were seen with weighted sampling with integer weights. The same is not true 1429 * with floating point weights. These produce quite large exponents. However, even 1430 * for floating point weights this can be useful. For random weights [0,1] less than 5% 1431 * will be less than 1e-12 and use exponential notation. 1432 */ 1433 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value) 1434 if (isOutputRange!(OutputRange, char)) 1435 { 1436 import std.format : formatValue, singleSpec; 1437 1438 immutable spec17f = singleSpec("%.17f"); 1439 immutable spec18f = singleSpec("%.18f"); 1440 immutable spec19f = singleSpec("%.19f"); 1441 immutable spec20f = singleSpec("%.20f"); 1442 immutable spec21f = singleSpec("%.21f"); 1443 immutable spec22f = singleSpec("%.22f"); 1444 immutable spec23f = singleSpec("%.23f"); 1445 immutable spec24f = singleSpec("%.24f"); 1446 immutable spec25f = singleSpec("%.25f"); 1447 immutable spec26f = singleSpec("%.26f"); 1448 immutable spec27f = singleSpec("%.27f"); 1449 immutable spec28f = singleSpec("%.28f"); 1450 1451 immutable spec17g = singleSpec("%.17g"); 1452 1453 auto formatSpec = 1454 (value >= 1e-01) ? spec17f : 1455 (value >= 1e-02) ? spec18f : 1456 (value >= 1e-03) ? spec19f : 1457 (value >= 1e-04) ? spec20f : 1458 (value >= 1e-05) ? spec21f : 1459 (value >= 1e-06) ? spec22f : 1460 (value >= 1e-07) ? spec23f : 1461 (value >= 1e-08) ? spec24f : 1462 (value >= 1e-09) ? spec25f : 1463 (value >= 1e-10) ? spec26f : 1464 (value >= 1e-11) ? spec27f : 1465 (value >= 1e-12) ? 
spec28f : spec17g; 1466 1467 outputStream.formatValue(value, formatSpec); 1468 } 1469 1470 unittest 1471 { 1472 void testFormatValue(double value, string expected) 1473 { 1474 import std.array : appender; 1475 import std.format : format; 1476 1477 auto s = appender!string(); 1478 s.formatRandomValue(value); 1479 assert(s.data == expected, 1480 format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data)); 1481 } 1482 1483 testFormatValue(1.0, "1.00000000000000000"); 1484 testFormatValue(0.1, "0.10000000000000001"); 1485 testFormatValue(0.01, "0.010000000000000000"); 1486 testFormatValue(1e-03, "0.0010000000000000000"); 1487 testFormatValue(1e-04, "0.00010000000000000000"); 1488 testFormatValue(1e-05, "0.000010000000000000001"); 1489 testFormatValue(1e-06, "0.0000010000000000000000"); 1490 testFormatValue(1e-07, "0.00000010000000000000000"); 1491 testFormatValue(1e-08, "0.000000010000000000000000"); 1492 testFormatValue(1e-09, "0.0000000010000000000000001"); 1493 testFormatValue(1e-10, "0.00000000010000000000000000"); 1494 testFormatValue(1e-11, "0.000000000009999999999999999"); 1495 testFormatValue(1e-12, "0.0000000000010000000000000000"); 1496 testFormatValue(1e-13, "1e-13"); 1497 testFormatValue(1e-14, "1e-14"); 1498 testFormatValue(12345678901234567e-15, "12.34567890123456735"); 1499 testFormatValue(12345678901234567e-16, "1.23456789012345669"); 1500 testFormatValue(12345678901234567e-17, "0.12345678901234566"); 1501 testFormatValue(12345678901234567e-18, "0.012345678901234567"); 1502 testFormatValue(12345678901234567e-19, "0.0012345678901234567"); 1503 testFormatValue(12345678901234567e-20, "0.00012345678901234567"); 1504 testFormatValue(12345678901234567e-21, "0.000012345678901234568"); 1505 testFormatValue(12345678901234567e-22, "0.0000012345678901234567"); 1506 testFormatValue(12345678901234567e-23, "0.00000012345678901234566"); 1507 testFormatValue(12345678901234567e-24, "0.000000012345678901234567"); 1508 
testFormatValue(12345678901234567e-25, "0.0000000012345678901234566"); 1509 testFormatValue(12345678901234567e-26, "0.00000000012345678901234568"); 1510 testFormatValue(12345678901234567e-27, "0.000000000012345678901234567"); 1511 testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567"); 1512 testFormatValue(12345678901234567e-29, "1.2345678901234566e-13"); 1513 } 1514 1515 1516 /** Convenience function for extracting a single field from a line. See 1517 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error 1518 * text tailored for this program. 1519 */ 1520 import std.traits : isSomeChar; 1521 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe 1522 if (isSomeChar!C) 1523 { 1524 import std.conv : ConvException, to; 1525 import std.format : format; 1526 import tsv_utils.common.utils : getTsvFieldValue; 1527 1528 T val; 1529 try 1530 { 1531 val = getTsvFieldValue!T(line, fieldIndex, delim); 1532 } 1533 catch (ConvException exc) 1534 { 1535 throw new Exception( 1536 format("Could not process line: %s\n File: %s Line: %s%s", 1537 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 1538 (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); 1539 } 1540 catch (Exception exc) 1541 { 1542 /* Not enough fields on the line. */ 1543 throw new Exception( 1544 format("Could not process line: %s\n File: %s Line: %s", 1545 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 1546 } 1547 1548 return val; 1549 } 1550 1551 unittest 1552 { 1553 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 1554 * These tests make basic sanity checks on the getFieldValue wrapper. 
1555 */ 1556 import std.exception; 1557 1558 assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 1559 assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 1560 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 1561 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 1562 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 1563 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); 1564 } 1565 1566 /* Unit tests for the main program start here. 1567 * 1568 * Portability note: Many of the tests here rely on generating consistent random numbers 1569 * across different platforms when using the same random seed. So far this has succeeded 1570 * on several different platorm, compiler, and library versions. However, it is certainly 1571 * possible this condition will not hold on other platforms. 1572 * 1573 * For tsv-sample, this portability implies generating the same results on different 1574 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, 1575 * but it is convenient for testing. If platforms are identified that do not generate 1576 * the same results these tests will need to be adjusted. 1577 */ 1578 version(unittest) 1579 { 1580 /* Unit test helper functions. */ 1581 1582 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 
1583 import std.conv : to; 1584 1585 void testTsvSample(string[] cmdArgs, string[][] expected) 1586 { 1587 import std.array : appender; 1588 import std.format : format; 1589 1590 assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); 1591 1592 auto formatAssertMessage(T...)(string msg, T formatArgs) 1593 { 1594 auto formatString = "[testTsvSample] %s: " ~ msg; 1595 return format(formatString, cmdArgs[0], formatArgs); 1596 } 1597 1598 TsvSampleOptions cmdopt; 1599 auto savedCmdArgs = cmdArgs.to!string; 1600 auto r = cmdopt.processArgs(cmdArgs); 1601 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 1602 auto output = appender!(char[])(); 1603 1604 tsvSample(cmdopt, output); // This invokes the main code line. 1605 1606 auto expectedOutput = expected.tsvDataToString; 1607 1608 assert(output.data == expectedOutput, 1609 formatAssertMessage( 1610 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 1611 expectedOutput.to!string, output.data.to!string)); 1612 } 1613 } 1614 1615 unittest 1616 { 1617 import std.path : buildPath; 1618 import std.file : rmdirRecurse; 1619 import std.format : format; 1620 1621 auto testDir = makeUnittestTempDir("tsv_sample"); 1622 scope(exit) testDir.rmdirRecurse; 1623 1624 /* Tabular data sets and expected results use the built-in static seed. 1625 * Tests are run by writing the data set to a file, then calling the main 1626 * routine to process. The function testTsvSample plays the role of the 1627 * main program. Rather than writing to expected output, the results are 1628 * matched against expected. The expected results were verified by hand 1629 * prior to inclusion in the test. 1630 * 1631 * The initial part of this section is simply setting up data files and 1632 * expected results. 1633 * 1634 * Expected results naming conventions: 1635 * - Prefix: dataNxMExpected. N and M are numbers. e.g. 
data3x6Expected 1636 * - Sampling Type (required): Permute, Replace, Bernoulli, Distinct 1637 * - Compatibility: Compat, AlgoR, Skip, Swap 1638 * - Weight Field: Wt<num>, e.g. Wt3 1639 * - Sample Size: Num<num>, eg. Num3 1640 * - Seed Value: V<num>, eg. V77 1641 * - Key Field: K<num>, e.g. K2 1642 * - Probability: P<num>, e.g P05 (5%) 1643 * - Printing Probalities: Probs 1644 * - Printing Probs in order: ProbsInorder 1645 * - Printing Probs with custom header: RVCustom 1646 */ 1647 1648 /* Empty file. */ 1649 string[][] dataEmpty = []; 1650 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1651 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 1652 1653 /* 3x1, header only. */ 1654 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1655 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1656 writeUnittestTsvFile(fpath_data3x0, data3x0); 1657 1658 /* 3x1 */ 1659 string[][] data3x1 = 1660 [["field_a", "field_b", "field_c"], 1661 ["tan", "タン", "8.5"]]; 1662 1663 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1664 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1665 writeUnittestTsvFile(fpath_data3x1, data3x1); 1666 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 1667 1668 string[][] data3x1ExpectedReplaceNum3 = 1669 [["field_a", "field_b", "field_c"], 1670 ["tan", "タン", "8.5"], 1671 ["tan", "タン", "8.5"], 1672 ["tan", "タン", "8.5"]]; 1673 1674 /* 3x2 */ 1675 string[][] data3x2 = 1676 [["field_a", "field_b", "field_c"], 1677 ["brown", "褐色", "29.2"], 1678 ["gray", "グレー", "6.2"]]; 1679 1680 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1681 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1682 writeUnittestTsvFile(fpath_data3x2, data3x2); 1683 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 1684 1685 string[][] data3x2PermuteCompat = 1686 [["field_a", "field_b", "field_c"], 1687 ["gray", "グレー", "6.2"], 1688 ["brown", "褐色", "29.2"]]; 1689 1690 
string[][] data3x2PermuteShuffle = 1691 [["field_a", "field_b", "field_c"], 1692 ["gray", "グレー", "6.2"], 1693 ["brown", "褐色", "29.2"]]; 1694 1695 /* 3x3 */ 1696 string[][] data3x3 = 1697 [["field_a", "field_b", "field_c"], 1698 ["orange", "オレンジ", "2.5"], 1699 ["pink", "ピンク", "1.1"], 1700 ["purple", "紫の", "42"]]; 1701 1702 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1703 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1704 writeUnittestTsvFile(fpath_data3x3, data3x3); 1705 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 1706 1707 string[][] data3x3ExpectedPermuteCompat = 1708 [["field_a", "field_b", "field_c"], 1709 ["purple", "紫の", "42"], 1710 ["pink", "ピンク", "1.1"], 1711 ["orange", "オレンジ", "2.5"]]; 1712 1713 string[][] data3x3ExpectedPermuteSwap = 1714 [["field_a", "field_b", "field_c"], 1715 ["purple", "紫の", "42"], 1716 ["orange", "オレンジ", "2.5"], 1717 ["pink", "ピンク", "1.1"]]; 1718 1719 /* 3x6 */ 1720 string[][] data3x6 = 1721 [["field_a", "field_b", "field_c"], 1722 ["red", "赤", "23.8"], 1723 ["green", "緑", "0.0072"], 1724 ["white", "白", "1.65"], 1725 ["yellow", "黄", "12"], 1726 ["blue", "青", "12"], 1727 ["black", "黒", "0.983"]]; 1728 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1729 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1730 writeUnittestTsvFile(fpath_data3x6, data3x6); 1731 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 1732 1733 // Randomization, all lines 1734 string[][] data3x6ExpectedPermuteCompat = 1735 [["field_a", "field_b", "field_c"], 1736 ["yellow", "黄", "12"], 1737 ["black", "黒", "0.983"], 1738 ["blue", "青", "12"], 1739 ["white", "白", "1.65"], 1740 ["green", "緑", "0.0072"], 1741 ["red", "赤", "23.8"]]; 1742 1743 string[][] data3x6ExpectedPermuteSwap = 1744 [["field_a", "field_b", "field_c"], 1745 ["black", "黒", "0.983"], 1746 ["green", "緑", "0.0072"], 1747 ["red", "赤", "23.8"], 1748 ["yellow", "黄", "12"], 1749 ["white", "白", "1.65"], 
1750 ["blue", "青", "12"]]; 1751 1752 string[][] data3x6ExpectedPermuteCompatProbs = 1753 [["random_value", "field_a", "field_b", "field_c"], 1754 ["0.96055546286515892", "yellow", "黄", "12"], 1755 ["0.75710153928957880", "black", "黒", "0.983"], 1756 ["0.52525980887003243", "blue", "青", "12"], 1757 ["0.49287854949943721", "white", "白", "1.65"], 1758 ["0.15929344086907804", "green", "緑", "0.0072"], 1759 ["0.010968807619065046", "red", "赤", "23.8"]]; 1760 1761 /* Note: data3x6ExpectedAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 1762 * both are effectively the same algorithm given that --num is data length. Both read 1763 * in the full data in order then call randomShuffle. 1764 */ 1765 string[][] data3x6ExpectedPermuteAlgoRNum6 = 1766 [["field_a", "field_b", "field_c"], 1767 ["black", "黒", "0.983"], 1768 ["green", "緑", "0.0072"], 1769 ["red", "赤", "23.8"], 1770 ["yellow", "黄", "12"], 1771 ["white", "白", "1.65"], 1772 ["blue", "青", "12"]]; 1773 1774 string[][] data3x6ExpectedPermuteAlgoRNum5 = 1775 [["field_a", "field_b", "field_c"], 1776 ["red", "赤", "23.8"], 1777 ["black", "黒", "0.983"], 1778 ["white", "白", "1.65"], 1779 ["green", "緑", "0.0072"], 1780 ["yellow", "黄", "12"]]; 1781 1782 string[][] data3x6ExpectedPermuteAlgoRNum4 = 1783 [["field_a", "field_b", "field_c"], 1784 ["blue", "青", "12"], 1785 ["green", "緑", "0.0072"], 1786 ["black", "黒", "0.983"], 1787 ["white", "白", "1.65"]]; 1788 1789 string[][] data3x6ExpectedPermuteAlgoRNum3 = 1790 [["field_a", "field_b", "field_c"], 1791 ["red", "赤", "23.8"], 1792 ["black", "黒", "0.983"], 1793 ["green", "緑", "0.0072"]]; 1794 1795 string[][] data3x6ExpectedPermuteAlgoRNum2 = 1796 [["field_a", "field_b", "field_c"], 1797 ["black", "黒", "0.983"], 1798 ["red", "赤", "23.8"]]; 1799 1800 string[][] data3x6ExpectedPermuteAlgoRNum1 = 1801 [["field_a", "field_b", "field_c"], 1802 ["green", "緑", "0.0072"]]; 1803 1804 string[][] data3x6ExpectedBernoulliProbsP100 = 1805 [["random_value", "field_a", "field_b", 
"field_c"], 1806 ["0.010968807619065046", "red", "赤", "23.8"], 1807 ["0.15929344086907804", "green", "緑", "0.0072"], 1808 ["0.49287854949943721", "white", "白", "1.65"], 1809 ["0.96055546286515892", "yellow", "黄", "12"], 1810 ["0.52525980887003243", "blue", "青", "12"], 1811 ["0.75710153928957880", "black", "黒", "0.983"]]; 1812 1813 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 1814 [["random_value", "field_a", "field_b", "field_c"], 1815 ["0.010968807619065046", "red", "赤", "23.8"], 1816 ["0.15929344086907804", "green", "緑", "0.0072"], 1817 ["0.49287854949943721", "white", "白", "1.65"], 1818 ["0.52525980887003243", "blue", "青", "12"]]; 1819 1820 string[][] data3x6ExpectedBernoulliSkipP40 = 1821 [["field_a", "field_b", "field_c"], 1822 ["red", "赤", "23.8"], 1823 ["green", "緑", "0.0072"], 1824 ["yellow", "黄", "12"]]; 1825 1826 string[][] data3x6ExpectedBernoulliCompatP60 = 1827 [["field_a", "field_b", "field_c"], 1828 ["red", "赤", "23.8"], 1829 ["green", "緑", "0.0072"], 1830 ["white", "白", "1.65"], 1831 ["blue", "青", "12"]]; 1832 1833 string[][] data3x6ExpectedDistinctK1K3P60 = 1834 [["field_a", "field_b", "field_c"], 1835 ["green", "緑", "0.0072"], 1836 ["white", "白", "1.65"], 1837 ["blue", "青", "12"]]; 1838 1839 string[][] data3x6ExpectedDistinctK1K3P60Probs = 1840 [["random_value", "field_a", "field_b", "field_c"], 1841 ["0", "green", "緑", "0.0072"], 1842 ["0", "white", "白", "1.65"], 1843 ["0", "blue", "青", "12"]]; 1844 1845 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 1846 [["custom_random_value_header", "field_a", "field_b", "field_c"], 1847 ["0", "green", "緑", "0.0072"], 1848 ["0", "white", "白", "1.65"], 1849 ["0", "blue", "青", "12"]]; 1850 1851 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 1852 [["random_value", "field_a", "field_b", "field_c"], 1853 ["1", "red", "赤", "23.8"], 1854 ["0", "green", "緑", "0.0072"], 1855 ["0", "white", "白", "1.65"], 1856 ["1", "yellow", "黄", "12"], 1857 ["3", "blue", "青", "12"], 1858 ["2", "black", "黒", 
"0.983"]]; 1859 1860 string[][] data3x6ExpectedPermuteWt3Probs = 1861 [["random_value", "field_a", "field_b", "field_c"], 1862 ["0.99665198757645390", "yellow", "黄", "12"], 1863 ["0.94775884809836686", "blue", "青", "12"], 1864 ["0.82728234682286661", "red", "赤", "23.8"], 1865 ["0.75346697377181959", "black", "黒", "0.983"], 1866 ["0.65130103496422487", "white", "白", "1.65"], 1867 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 1868 1869 string[][] data3x6ExpectedWt3ProbsInorder = 1870 [["random_value", "field_a", "field_b", "field_c"], 1871 ["0.82728234682286661", "red", "赤", "23.8"], 1872 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 1873 ["0.65130103496422487", "white", "白", "1.65"], 1874 ["0.99665198757645390", "yellow", "黄", "12"], 1875 ["0.94775884809836686", "blue", "青", "12"], 1876 ["0.75346697377181959", "black", "黒", "0.983"]]; 1877 1878 string[][] data3x6ExpectedPermuteWt3 = 1879 [["field_a", "field_b", "field_c"], 1880 ["yellow", "黄", "12"], 1881 ["blue", "青", "12"], 1882 ["red", "赤", "23.8"], 1883 ["black", "黒", "0.983"], 1884 ["white", "白", "1.65"], 1885 ["green", "緑", "0.0072"]]; 1886 1887 string[][] data3x6ExpectedReplaceNum10 = 1888 [["field_a", "field_b", "field_c"], 1889 ["black", "黒", "0.983"], 1890 ["green", "緑", "0.0072"], 1891 ["green", "緑", "0.0072"], 1892 ["red", "赤", "23.8"], 1893 ["yellow", "黄", "12"], 1894 ["red", "赤", "23.8"], 1895 ["white", "白", "1.65"], 1896 ["yellow", "黄", "12"], 1897 ["yellow", "黄", "12"], 1898 ["white", "白", "1.65"], 1899 ]; 1900 1901 string[][] data3x6ExpectedReplaceNum10V77 = 1902 [["field_a", "field_b", "field_c"], 1903 ["black", "黒", "0.983"], 1904 ["red", "赤", "23.8"], 1905 ["black", "黒", "0.983"], 1906 ["yellow", "黄", "12"], 1907 ["green", "緑", "0.0072"], 1908 ["green", "緑", "0.0072"], 1909 ["green", "緑", "0.0072"], 1910 ["yellow", "黄", "12"], 1911 ["blue", "青", "12"], 1912 ["white", "白", "1.65"], 1913 ]; 1914 1915 /* Using a different static seed. 
*/ 1916 string[][] data3x6ExpectedPermuteCompatV41Probs = 1917 [["random_value", "field_a", "field_b", "field_c"], 1918 ["0.68057272653095424", "green", "緑", "0.0072"], 1919 ["0.67681624367833138", "blue", "青", "12"], 1920 ["0.32097338931635022", "yellow", "黄", "12"], 1921 ["0.25092361867427826", "red", "赤", "23.8"], 1922 ["0.15535934292711318", "black", "黒", "0.983"], 1923 ["0.046095821075141430", "white", "白", "1.65"]]; 1924 1925 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 1926 [["random_value", "field_a", "field_b", "field_c"], 1927 ["0.25092361867427826", "red", "赤", "23.8"], 1928 ["0.046095821075141430", "white", "白", "1.65"], 1929 ["0.32097338931635022", "yellow", "黄", "12"], 1930 ["0.15535934292711318", "black", "黒", "0.983"]]; 1931 1932 string[][] data3x6ExpectedPermuteWt3V41Probs = 1933 [["random_value", "field_a", "field_b", "field_c"], 1934 ["0.96799377498910666", "blue", "青", "12"], 1935 ["0.94356245792573568", "red", "赤", "23.8"], 1936 ["0.90964601024271996", "yellow", "黄", "12"], 1937 ["0.15491658409260103", "white", "白", "1.65"], 1938 ["0.15043620392537033", "black", "黒", "0.983"], 1939 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 1940 1941 string[][] data3x6ExpectedWt3V41ProbsInorder = 1942 [["random_value", "field_a", "field_b", "field_c"], 1943 ["0.94356245792573568", "red", "赤", "23.8"], 1944 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 1945 ["0.15491658409260103", "white", "白", "1.65"], 1946 ["0.90964601024271996", "yellow", "黄", "12"], 1947 ["0.96799377498910666", "blue", "青", "12"], 1948 ["0.15043620392537033", "black", "黒", "0.983"]]; 1949 1950 1951 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1952 string[][] combo1ExpectedPermuteCompat = 1953 [["field_a", "field_b", "field_c"], 1954 ["yellow", "黄", "12"], 1955 ["tan", "タン", "8.5"], 1956 ["brown", "褐色", "29.2"], 1957 ["green", "緑", "0.0072"], 1958 ["red", "赤", "23.8"], 1959 ["purple", "紫の", "42"], 1960 ["black", "黒", "0.983"], 1961 ["white", "白", "1.65"], 1962 ["gray", "グレー", "6.2"], 1963 ["blue", "青", "12"], 1964 ["pink", "ピンク", "1.1"], 1965 ["orange", "オレンジ", "2.5"]]; 1966 1967 string[][] combo1ExpectedPermuteCompatProbs = 1968 [["random_value", "field_a", "field_b", "field_c"], 1969 ["0.97088520275428891", "yellow", "黄", "12"], 1970 ["0.96055546286515892", "tan", "タン", "8.5"], 1971 ["0.81756894313730299", "brown", "褐色", "29.2"], 1972 ["0.75710153928957880", "green", "緑", "0.0072"], 1973 ["0.52525980887003243", "red", "赤", "23.8"], 1974 ["0.49287854949943721", "purple", "紫の", "42"], 1975 ["0.47081507067196071", "black", "黒", "0.983"], 1976 ["0.38388182921335101", "white", "白", "1.65"], 1977 ["0.29215990612283349", "gray", "グレー", "6.2"], 1978 ["0.24033216014504433", "blue", "青", "12"], 1979 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1980 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 1981 1982 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 1983 string[][] combo1ExpectedProbsInorder = 1984 [["random_value", "field_a", "field_b", "field_c"], 1985 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 1986 ["0.15929344086907804", "pink", "ピンク", "1.1"], 1987 ["0.49287854949943721", "purple", "紫の", "42"], 1988 ["0.96055546286515892", "tan", "タン", "8.5"], 1989 ["0.52525980887003243", "red", "赤", "23.8"], 1990 ["0.75710153928957880", "green", "緑", "0.0072"], 1991 ["0.38388182921335101", "white", "白", "1.65"], 1992 ["0.97088520275428891", "yellow", "黄", "12"], 1993 ["0.24033216014504433", "blue", "青", "12"], 1994 ["0.47081507067196071", "black", "黒", "0.983"], 1995 ["0.81756894313730299", "brown", "褐色", "29.2"], 1996 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 1997 1998 string[][] combo1ExpectedBernoulliCompatP50Probs = 1999 [["random_value", "field_a", "field_b", "field_c"], 2000 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2001 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2002 ["0.49287854949943721", "purple", "紫の", "42"], 2003 ["0.38388182921335101", "white", "白", "1.65"], 2004 ["0.24033216014504433", "blue", "青", "12"], 2005 ["0.47081507067196071", "black", "黒", "0.983"], 2006 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2007 2008 string[][] combo1ExpectedBernoulliCompatP40 = 2009 [["field_a", "field_b", "field_c"], 2010 ["orange", "オレンジ", "2.5"], 2011 ["pink", "ピンク", "1.1"], 2012 ["white", "白", "1.65"], 2013 ["blue", "青", "12"], 2014 ["gray", "グレー", "6.2"]]; 2015 2016 string[][] combo1ExpectedDistinctK1P40 = 2017 [["field_a", "field_b", "field_c"], 2018 ["orange", "オレンジ", "2.5"], 2019 ["red", "赤", "23.8"], 2020 ["green", "緑", "0.0072"], 2021 ["blue", "青", "12"], 2022 ["black", "黒", "0.983"]]; 2023 2024 string[][] combo1ExpectedPermuteWt3Probs = 2025 [["random_value", "field_a", "field_b", "field_c"], 2026 ["0.99754077523718754", "yellow", "黄", "12"], 2027 ["0.99527665440088786", "tan", "タン", "8.5"], 2028 ["0.99312578945741659", "brown", "褐色", "29.2"], 2029 ["0.98329602553389361", "purple", 
"紫の", "42"], 2030 ["0.97330961938083660", "red", "赤", "23.8"], 2031 ["0.88797551521739648", "blue", "青", "12"], 2032 ["0.81999230489041786", "gray", "グレー", "6.2"], 2033 ["0.55975569204250941", "white", "白", "1.65"], 2034 ["0.46472135609205739", "black", "黒", "0.983"], 2035 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2036 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2037 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2038 2039 string[][] combo1ExpectedPermuteWt3 = 2040 [["field_a", "field_b", "field_c"], 2041 ["yellow", "黄", "12"], 2042 ["tan", "タン", "8.5"], 2043 ["brown", "褐色", "29.2"], 2044 ["purple", "紫の", "42"], 2045 ["red", "赤", "23.8"], 2046 ["blue", "青", "12"], 2047 ["gray", "グレー", "6.2"], 2048 ["white", "白", "1.65"], 2049 ["black", "黒", "0.983"], 2050 ["pink", "ピンク", "1.1"], 2051 ["orange", "オレンジ", "2.5"], 2052 ["green", "緑", "0.0072"]]; 2053 2054 string[][] combo1ExpectedPermuteAlgoRNum4 = 2055 [["field_a", "field_b", "field_c"], 2056 ["blue", "青", "12"], 2057 ["gray", "グレー", "6.2"], 2058 ["brown", "褐色", "29.2"], 2059 ["white", "白", "1.65"]]; 2060 2061 string[][] combo1ExpectedReplaceNum10 = 2062 [["field_a", "field_b", "field_c"], 2063 ["gray", "グレー", "6.2"], 2064 ["yellow", "黄", "12"], 2065 ["yellow", "黄", "12"], 2066 ["white", "白", "1.65"], 2067 ["tan", "タン", "8.5"], 2068 ["white", "白", "1.65"], 2069 ["blue", "青", "12"], 2070 ["black", "黒", "0.983"], 2071 ["tan", "タン", "8.5"], 2072 ["purple", "紫の", "42"]]; 2073 2074 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 2075 string[][] data1x200 = 2076 [["field_a"], 2077 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2078 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2079 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2080 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2081 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2082 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2083 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2084 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2085 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2086 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2087 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2088 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2089 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2090 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2091 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2092 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2093 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2094 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2095 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2096 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2097 ]; 2098 2099 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 2100 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2101 writeUnittestTsvFile(fpath_data1x200, data1x200); 2102 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]); 2103 2104 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2105 [["field_a"], 2106 ["077"], 2107 ["119"]]; 2108 2109 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2110 [["field_a"], 2111 ["038"], 2112 ["059"], 2113 ["124"], 2114 ["161"], 2115 ["162"], 2116 ["183"]]; 2117 2118 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2119 [["field_a"], 2120 ["025"], 2121 ["039"], 2122 ["082"], 2123 ["107"], 2124 ["108"], 2125 ["122"], 2126 ["136"], 2127 ["166"], 2128 ["182"]]; 2129 2130 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2131 [["field_a"], 2132 ["072"]]; 2133 2134 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2135 [["field_a"], 2136 ["004"], 2137 ["072"]]; 2138 2139 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2140 [["field_a"], 2141 ["004"], 2142 ["072"], 2143 ["181"]]; 2144 2145 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2146 * only expected results. The header is from 3x0, the results are offset 1-position 2147 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2148 */ 2149 string[][] combo2ExpectedBernoulliSkipV333P03 = 2150 [["field_a", "field_b", "field_c"], 2151 ["024"], 2152 ["038"], 2153 ["081"], 2154 ["106"], 2155 ["107"], 2156 ["121"], 2157 ["135"], 2158 ["165"], 2159 ["181"]]; 2160 2161 2162 /* 1x10 - Simple 1-column file. 
*/ 2163 string[][] data1x10 = 2164 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2165 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2166 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2167 writeUnittestTsvFile(fpath_data1x10, data1x10); 2168 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 2169 2170 string[][] data1x10ExpectedPermuteCompat = 2171 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2172 2173 string[][] data1x10ExpectedPermuteWt1 = 2174 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2175 2176 /* 2x10a - Uniform distribution [0,1]. */ 2177 string[][] data2x10a = 2178 [["line", "weight"], 2179 ["1", "0.26788837"], 2180 ["2", "0.06601298"], 2181 ["3", "0.38627527"], 2182 ["4", "0.47379424"], 2183 ["5", "0.02966641"], 2184 ["6", "0.05636231"], 2185 ["7", "0.70529242"], 2186 ["8", "0.91836862"], 2187 ["9", "0.99103720"], 2188 ["10", "0.31401740"]]; 2189 2190 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2191 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2192 2193 string[][] data2x10aExpectedPermuteWt2Probs = 2194 [["random_value", "line", "weight"], 2195 ["0.96833865494543658", "8", "0.91836862"], 2196 ["0.91856842054413923", "4", "0.47379424"], 2197 ["0.25730832087795091", "7", "0.70529242"], 2198 ["0.23725317907018120", "9", "0.99103720"], 2199 ["0.16016096701872204", "3", "0.38627527"], 2200 ["0.090819662667243381", "10", "0.31401740"], 2201 ["0.0071764539244361172", "6", "0.05636231"], 2202 ["0.000000048318642951630057", "1", "0.26788837"], 2203 ["0.00000000037525692966535517", "5", "0.02966641"], 2204 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2205 2206 /* 2x10b - Uniform distribution [0,1000]. 
*/ 2207 string[][] data2x10b = 2208 [["line", "weight"], 2209 ["1", "761"], 2210 ["2", "432"], 2211 ["3", "103"], 2212 ["4", "448"], 2213 ["5", "750"], 2214 ["6", "711"], 2215 ["7", "867"], 2216 ["8", "841"], 2217 ["9", "963"], 2218 ["10", "784"]]; 2219 2220 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2221 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2222 2223 string[][] data2x10bExpectedPermuteWt2Probs = 2224 [["random_value", "line", "weight"], 2225 ["0.99996486739067969", "8", "841"], 2226 ["0.99991017467137211", "4", "448"], 2227 ["0.99960871524873662", "6", "711"], 2228 ["0.99914188537143800", "5", "750"], 2229 ["0.99903963250274785", "10", "784"], 2230 ["0.99889631825931946", "7", "867"], 2231 ["0.99852058315191139", "9", "963"], 2232 ["0.99575669679158918", "2", "432"], 2233 ["0.99408758732050595", "1", "761"], 2234 ["0.99315467761212362", "3", "103"]]; 2235 2236 /* 2x10c - Logarithmic distribution in random order. */ 2237 string[][] data2x10c = 2238 [["line", "weight"], 2239 ["1", "31.85"], 2240 ["2", "17403.31"], 2241 ["3", "653.84"], 2242 ["4", "8.23"], 2243 ["5", "2671.04"], 2244 ["6", "26226.08"], 2245 ["7", "1.79"], 2246 ["8", "354.56"], 2247 ["9", "35213.81"], 2248 ["10", "679.29"]]; 2249 2250 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2251 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2252 2253 string[][] data2x10cExpectedPermuteWt2Probs = 2254 [["random_value", "line", "weight"], 2255 ["0.99998939008709697", "6", "26226.08"], 2256 ["0.99995951291695517", "9", "35213.81"], 2257 ["0.99991666907613541", "8", "354.56"], 2258 ["0.99989445052186410", "2", "17403.31"], 2259 ["0.99975897602861630", "5", "2671.04"], 2260 ["0.99891852769877643", "3", "653.84"], 2261 ["0.99889167752782515", "10", "679.29"], 2262 ["0.99512207506850148", "4", "8.23"], 2263 ["0.86789371584259023", "1", "31.85"], 2264 ["0.58574438162915610", "7", "1.79"]]; 2265 2266 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 2267 string[][] data2x10d = 2268 [["line", "weight"], 2269 ["1", "1.79"], 2270 ["2", "8.23"], 2271 ["3", "31.85"], 2272 ["4", "354.56"], 2273 ["5", "653.84"], 2274 ["6", "679.29"], 2275 ["7", "2671.04"], 2276 ["8", "17403.31"], 2277 ["9", "26226.08"], 2278 ["10", "35213.81"]]; 2279 2280 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 2281 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 2282 2283 string[][] data2x10dExpectedPermuteWt2Probs = 2284 [["random_value", "line", "weight"], 2285 ["0.99999830221846353", "8", "17403.31"], 2286 ["0.99997860834041397", "10", "35213.81"], 2287 ["0.99994563828986716", "9", "26226.08"], 2288 ["0.99988650363575737", "4", "354.56"], 2289 ["0.99964161939190088", "7", "2671.04"], 2290 ["0.99959045338948649", "6", "679.29"], 2291 ["0.99901574490639788", "5", "653.84"], 2292 ["0.97803163304747431", "3", "31.85"], 2293 ["0.79994791806910948", "2", "8.23"], 2294 ["0.080374261239949119", "1", "1.79"]]; 2295 2296 /* 2x10e. Logarithmic distribution in descending order. 
*/ 2297 string[][] data2x10e = 2298 [["line", "weight"], 2299 ["1", "35213.81"], 2300 ["2", "26226.08"], 2301 ["3", "17403.31"], 2302 ["4", "2671.04"], 2303 ["5", "679.29"], 2304 ["6", "653.84"], 2305 ["7", "354.56"], 2306 ["8", "31.85"], 2307 ["9", "8.23"], 2308 ["10", "1.79"]]; 2309 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 2310 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 2311 2312 string[][] data2x10eExpectedPermuteWt2Probs = 2313 [["random_value", "line", "weight"], 2314 ["0.99998493348975237", "4", "2671.04"], 2315 ["0.99995934807202624", "3", "17403.31"], 2316 ["0.99992995739727453", "2", "26226.08"], 2317 ["0.99987185679245649", "1", "35213.81"], 2318 ["0.99957451563173938", "6", "653.84"], 2319 ["0.99907273650209583", "8", "31.85"], 2320 ["0.99905260312968946", "5", "679.29"], 2321 ["0.99730333650516401", "7", "354.56"], 2322 ["0.84093902435227808", "9", "8.23"], 2323 ["0.65650015926290028", "10", "1.79"]]; 2324 2325 /* Data sets for distinct sampling. */ 2326 string[][] data5x25 = 2327 [["ID", "Shape", "Color", "Size", "Weight"], 2328 ["01", "circle", "red", "S", "10"], 2329 ["02", "circle", "black", "L", "20"], 2330 ["03", "square", "black", "L", "20"], 2331 ["04", "circle", "green", "L", "30"], 2332 ["05", "ellipse", "red", "S", "20"], 2333 ["06", "triangle", "red", "S", "10"], 2334 ["07", "triangle", "red", "L", "20"], 2335 ["08", "square", "black", "S", "10"], 2336 ["09", "circle", "black", "S", "20"], 2337 ["10", "square", "green", "L", "20"], 2338 ["11", "triangle", "red", "L", "20"], 2339 ["12", "circle", "green", "L", "30"], 2340 ["13", "ellipse", "red", "S", "20"], 2341 ["14", "circle", "green", "L", "30"], 2342 ["15", "ellipse", "red", "L", "30"], 2343 ["16", "square", "red", "S", "10"], 2344 ["17", "circle", "black", "L", "20"], 2345 ["18", "square", "red", "S", "20"], 2346 ["19", "square", "black", "L", "20"], 2347 ["20", "circle", "red", "S", "10"], 2348 ["21", "ellipse", "black", "L", "30"], 2349 ["22", "triangle", 
"red", "L", "30"], 2350 ["23", "circle", "green", "S", "20"], 2351 ["24", "square", "green", "L", "20"], 2352 ["25", "circle", "red", "S", "10"], 2353 ]; 2354 2355 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 2356 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 2357 writeUnittestTsvFile(fpath_data5x25, data5x25); 2358 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]); 2359 2360 string[][] data5x25ExpectedDistinctK2P40 = 2361 [["ID", "Shape", "Color", "Size", "Weight"], 2362 ["03", "square", "black", "L", "20"], 2363 ["05", "ellipse", "red", "S", "20"], 2364 ["08", "square", "black", "S", "10"], 2365 ["10", "square", "green", "L", "20"], 2366 ["13", "ellipse", "red", "S", "20"], 2367 ["15", "ellipse", "red", "L", "30"], 2368 ["16", "square", "red", "S", "10"], 2369 ["18", "square", "red", "S", "20"], 2370 ["19", "square", "black", "L", "20"], 2371 ["21", "ellipse", "black", "L", "30"], 2372 ["24", "square", "green", "L", "20"], 2373 ]; 2374 2375 string[][] data5x25ExpectedDistinctK2K4P20 = 2376 [["ID", "Shape", "Color", "Size", "Weight"], 2377 ["03", "square", "black", "L", "20"], 2378 ["07", "triangle", "red", "L", "20"], 2379 ["08", "square", "black", "S", "10"], 2380 ["10", "square", "green", "L", "20"], 2381 ["11", "triangle", "red", "L", "20"], 2382 ["16", "square", "red", "S", "10"], 2383 ["18", "square", "red", "S", "20"], 2384 ["19", "square", "black", "L", "20"], 2385 ["22", "triangle", "red", "L", "30"], 2386 ["24", "square", "green", "L", "20"], 2387 ]; 2388 2389 string[][] data5x25ExpectedDistinctK2K3K4P20 = 2390 [["ID", "Shape", "Color", "Size", "Weight"], 2391 ["04", "circle", "green", "L", "30"], 2392 ["07", "triangle", "red", "L", "20"], 2393 ["09", "circle", "black", "S", "20"], 2394 ["11", "triangle", "red", "L", "20"], 2395 ["12", "circle", "green", "L", "30"], 2396 ["14", "circle", "green", "L", "30"], 2397 ["16", "square", "red", "S", "10"], 2398 ["18", "square", "red", "S", "20"], 2399 
["22", "triangle", "red", "L", "30"], 2400 ]; 2401 2402 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 2403 string[][] data2x25 = 2404 [["Shape", "Size"], 2405 ["circle", "S"], 2406 ["circle", "L"], 2407 ["square", "L"], 2408 ["circle", "L"], 2409 ["ellipse", "S"], 2410 ["triangle", "S"], 2411 ["triangle", "L"], 2412 ["square", "S"], 2413 ["circle", "S"], 2414 ["square", "L"], 2415 ["triangle", "L"], 2416 ["circle", "L"], 2417 ["ellipse", "S"], 2418 ["circle", "L"], 2419 ["ellipse", "L"], 2420 ["square", "S"], 2421 ["circle", "L"], 2422 ["square", "S"], 2423 ["square", "L"], 2424 ["circle", "S"], 2425 ["ellipse", "L"], 2426 ["triangle", "L"], 2427 ["circle", "S"], 2428 ["square", "L"], 2429 ["circle", "S"], 2430 ]; 2431 2432 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 2433 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 2434 writeUnittestTsvFile(fpath_data2x25, data2x25); 2435 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]); 2436 2437 string[][] data2x25ExpectedDistinctK1K2P20 = 2438 [["Shape", "Size"], 2439 ["square", "L"], 2440 ["triangle", "L"], 2441 ["square", "S"], 2442 ["square", "L"], 2443 ["triangle", "L"], 2444 ["square", "S"], 2445 ["square", "S"], 2446 ["square", "L"], 2447 ["triangle", "L"], 2448 ["square", "L"], 2449 ]; 2450 2451 string[][] data1x25 = 2452 [["Shape-Size"], 2453 ["circle-S"], 2454 ["circle-L"], 2455 ["square-L"], 2456 ["circle-L"], 2457 ["ellipse-S"], 2458 ["triangle-S"], 2459 ["triangle-L"], 2460 ["square-S"], 2461 ["circle-S"], 2462 ["square-L"], 2463 ["triangle-L"], 2464 ["circle-L"], 2465 ["ellipse-S"], 2466 ["circle-L"], 2467 ["ellipse-L"], 2468 ["square-S"], 2469 ["circle-L"], 2470 ["square-S"], 2471 ["square-L"], 2472 ["circle-S"], 2473 ["ellipse-L"], 2474 ["triangle-L"], 2475 ["circle-S"], 2476 ["square-L"], 2477 ["circle-S"], 2478 ]; 2479 2480 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 2481 string 
fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 2482 writeUnittestTsvFile(fpath_data1x25, data1x25); 2483 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]); 2484 2485 string[][] data1x25ExpectedDistinctK1P20 = 2486 [["Shape-Size"], 2487 ["triangle-L"], 2488 ["square-S"], 2489 ["triangle-L"], 2490 ["ellipse-L"], 2491 ["square-S"], 2492 ["square-S"], 2493 ["ellipse-L"], 2494 ["triangle-L"], 2495 ]; 2496 2497 string[][] data1x25ExpectedDistinctK1P20Probs = 2498 [["random_value", "Shape-Size"], 2499 ["0", "triangle-L"], 2500 ["0", "square-S"], 2501 ["0", "triangle-L"], 2502 ["0", "ellipse-L"], 2503 ["0", "square-S"], 2504 ["0", "square-S"], 2505 ["0", "ellipse-L"], 2506 ["0", "triangle-L"], 2507 ]; 2508 2509 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 2510 [["random_value", "Shape-Size"], 2511 ["1", "circle-S"], 2512 ["4", "circle-L"], 2513 ["2", "square-L"], 2514 ["4", "circle-L"], 2515 ["2", "ellipse-S"], 2516 ["1", "triangle-S"], 2517 ["0", "triangle-L"], 2518 ["0", "square-S"], 2519 ["1", "circle-S"], 2520 ["2", "square-L"], 2521 ["0", "triangle-L"], 2522 ["4", "circle-L"], 2523 ["2", "ellipse-S"], 2524 ["4", "circle-L"], 2525 ["0", "ellipse-L"], 2526 ["0", "square-S"], 2527 ["4", "circle-L"], 2528 ["0", "square-S"], 2529 ["2", "square-L"], 2530 ["1", "circle-S"], 2531 ["0", "ellipse-L"], 2532 ["0", "triangle-L"], 2533 ["1", "circle-S"], 2534 ["2", "square-L"], 2535 ["1", "circle-S"], 2536 ]; 2537 2538 /* 2539 * Enough setup! Actually run some tests! 2540 */ 2541 2542 /* Permutations. Headers, static seed, compatibility mode. With weights and without. 
*/ 2543 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 2544 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 2545 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 2546 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 2547 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 2548 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 2549 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2550 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2551 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2552 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2553 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2554 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2555 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 2556 2557 /* Permutations, without compatibility mode, or with both compatibility and printing. 
*/ 2558 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 2559 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 2560 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 2561 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 2562 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 2563 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 2564 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2565 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2566 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2567 2568 /* Reservoir sampling using Algorithm R. 2569 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
2570 */ 2571 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2572 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2573 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 2574 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 2575 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 2576 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 2577 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2578 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2579 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5); 2580 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4); 2581 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3); 2582 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2); 2583 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1); 2584 2585 /* Bernoulli sampling cases. 
*/ 2586 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 2587 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 2588 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 2589 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 2590 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 2591 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2592 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 2593 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 2594 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 2595 2596 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 2597 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 2598 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 2599 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 2600 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 2601 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 2602 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 2603 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 2604 2605 /* Distinct sampling cases. */ 2606 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 2607 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 2608 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 2609 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 2610 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 2611 2612 2613 2614 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 2615 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
2616 */ 2617 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2618 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2619 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2620 data3x6ExpectedWt3ProbsInorder); 2621 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2622 data3x6ExpectedWt3V41ProbsInorder); 2623 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 2624 data3x6ExpectedDistinctK1K3P60Probs); 2625 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 2626 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 2627 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 2628 data3x6ExpectedDistinctK2P2ProbsInorder); 2629 2630 /* Simple random sampling with replacement. */ 2631 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2632 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 2633 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 2634 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 2635 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 2636 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 2637 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 2638 2639 /* Permutations, compatibility mode, without headers. 
*/ 2640 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]); 2641 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]); 2642 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]); 2643 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]); 2644 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]); 2645 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2646 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2647 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2648 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]); 2649 2650 /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. 
*/ 2651 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]); 2652 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]); 2653 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]); 2654 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]); 2655 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2656 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2657 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2658 2659 /* Reservoir sampling using Algorithm R, no headers. */ 2660 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2661 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2662 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]); 2663 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]); 2664 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2665 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2666 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]); 2667 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]); 2668 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", 
fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]); 2669 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]); 2670 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]); 2671 2672 /* Bernoulli sampling cases. */ 2673 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]); 2674 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2675 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2676 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2677 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]); 2678 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]); 2679 2680 /* Bernoulli sampling with probabilities in skip sampling range. 
*/ 2681 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]); 2682 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]); 2683 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]); 2684 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]); 2685 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]); 2686 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]); 2687 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]); 2688 2689 /* Distinct sampling cases. */ 2690 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]); 2691 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2692 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2693 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2694 2695 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. 
*/ 2696 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2697 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]); 2698 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 2699 data3x6ExpectedDistinctK1K3P60Probs[1..$]); 2700 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 2701 data3x6ExpectedDistinctK2P2ProbsInorder[1..$]); 2702 2703 /* Simple random sampling with replacement. */ 2704 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2705 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 2706 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]); 2707 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]); 2708 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]); 2709 2710 /* Multi-file tests. 
*/ 2711 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 2712 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2713 combo1ExpectedPermuteCompat); 2714 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 2715 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2716 combo1ExpectedPermuteCompatProbs); 2717 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 2718 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2719 combo1ExpectedPermuteWt3Probs); 2720 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2721 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2722 combo1ExpectedPermuteWt3); 2723 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2724 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2725 combo1ExpectedPermuteAlgoRNum4); 2726 2727 /* Multi-file, no headers. 
*/ 2728 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 2729 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2730 fpath_data3x6_noheader, fpath_data3x2_noheader], 2731 combo1ExpectedPermuteCompat[1..$]); 2732 testTsvSample(["test-c7", "--static-seed", "--print-random", 2733 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2734 fpath_data3x6_noheader, fpath_data3x2_noheader], 2735 combo1ExpectedPermuteCompatProbs[1..$]); 2736 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 2737 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2738 fpath_data3x6_noheader, fpath_data3x2_noheader], 2739 combo1ExpectedPermuteWt3Probs[1..$]); 2740 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2741 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2742 fpath_data3x6_noheader, fpath_data3x2_noheader], 2743 combo1ExpectedPermuteWt3[1..$]); 2744 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2745 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2746 fpath_data3x6_noheader, fpath_data3x2_noheader], 2747 combo1ExpectedPermuteAlgoRNum4[1..$]); 2748 2749 /* Bernoulli sampling cases. 
*/ 2750 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 2751 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2752 combo1ExpectedBernoulliCompatP50Probs); 2753 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 2754 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2755 combo1ExpectedBernoulliCompatP40); 2756 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 2757 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2758 fpath_data3x6_noheader, fpath_data3x2_noheader], 2759 combo1ExpectedBernoulliCompatP50Probs[1..$]); 2760 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 2761 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2762 fpath_data3x6_noheader, fpath_data3x2_noheader], 2763 combo1ExpectedBernoulliCompatP40[1..$]); 2764 2765 /* Bernoulli sampling with probabilities in skip sampling range. */ 2766 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 2767 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 2768 combo2ExpectedBernoulliSkipV333P03); 2769 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 2770 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 2771 combo2ExpectedBernoulliSkipV333P03[1..$]); 2772 2773 /* Distinct sampling cases. */ 2774 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 2775 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2776 combo1ExpectedDistinctK1P40); 2777 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 2778 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2779 fpath_data3x6_noheader, fpath_data3x2_noheader], 2780 combo1ExpectedDistinctK1P40[1..$]); 2781 2782 /* Generating random weights. 
*/ 2783 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 2784 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2785 combo1ExpectedProbsInorder); 2786 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 2787 fpath_data3x3_noheader, fpath_data3x1_noheader, 2788 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 2789 combo1ExpectedProbsInorder[1..$]); 2790 2791 /* Simple random sampling with replacement. */ 2792 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 2793 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2794 combo1ExpectedReplaceNum10); 2795 2796 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 2797 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2798 fpath_data3x6_noheader, fpath_data3x2_noheader], 2799 combo1ExpectedReplaceNum10[1..$]); 2800 2801 /* Single column file. */ 2802 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2803 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2804 2805 /* Distributions. */ 2806 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 2807 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 2808 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 2809 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 2810 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 2811 2812 /* Tests of subset sample (--n|num) field. 
2813 * 2814 * Note: The way these tests are done ensures that subset length does not affect 2815 * output order. 2816 */ 2817 import std.algorithm : min; 2818 for (size_t n = data3x6.length + 2; n >= 1; n--) 2819 { 2820 /* reservoirSamplingViaHeap. 2821 */ 2822 size_t expectedLength = min(data3x6.length, n + 1); 2823 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 2824 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2825 2826 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 2827 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2828 2829 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 2830 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 2831 2832 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 2833 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 2834 2835 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 2836 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 2837 2838 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 2839 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 2840 2841 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 2842 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 2843 2844 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 2845 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 2846 2847 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 2848 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 2849 2850 /* Bernoulli sampling. 
2851 */ 2852 import std.algorithm : min; 2853 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 2854 2855 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2856 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 2857 2858 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2859 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 2860 2861 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2862 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 2863 2864 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2865 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 2866 2867 /* Distinct Sampling. 2868 */ 2869 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 2870 2871 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2872 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 2873 2874 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2875 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 2876 2877 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2878 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 2879 2880 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2881 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 2882 } 2883 2884 /* Similar tests with the 1x10 data set. 
*/ 2885 for (size_t n = data1x10.length + 2; n >= 1; n--) 2886 { 2887 size_t expectedLength = min(data1x10.length, n + 1); 2888 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 2889 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 2890 2891 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 2892 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 2893 2894 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 2895 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 2896 2897 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 2898 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 2899 } 2900 2901 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 2902 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 2903 { 2904 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 2905 data3x6ExpectedReplaceNum10[0 .. n + 1]); 2906 2907 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 2908 data3x6ExpectedReplaceNum10[1 .. n + 1]); 2909 } 2910 2911 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 2912 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 2913 { 2914 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 2915 2916 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2917 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 2918 2919 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2920 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 2921 } 2922 2923 2924 /* Distinct sampling tests. 
*/ 2925 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 2926 data5x25ExpectedDistinctK2P40); 2927 2928 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 2929 data5x25ExpectedDistinctK2K4P20); 2930 2931 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 2932 data5x25ExpectedDistinctK2K3K4P20); 2933 2934 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 2935 data5x25ExpectedDistinctK2P40[1..$]); 2936 2937 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 2938 data5x25ExpectedDistinctK2K4P20[1..$]); 2939 2940 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 2941 data5x25ExpectedDistinctK2K3K4P20[1..$]); 2942 2943 2944 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 2945 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 2946 * in data2x25 are the same keys as '-k 2,4' in data5x25. 2947 */ 2948 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 2949 data2x25ExpectedDistinctK1K2P20); 2950 2951 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 2952 data2x25ExpectedDistinctK1K2P20); 2953 2954 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 2955 data2x25ExpectedDistinctK1K2P20[1..$]); 2956 2957 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 2958 data2x25ExpectedDistinctK1K2P20[1..$]); 2959 2960 /* Similar to the last set, but for a 1-column file. Also with random value printing. 
*/ 2961 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 2962 data1x25ExpectedDistinctK1P20); 2963 2964 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 2965 data1x25ExpectedDistinctK1P20); 2966 2967 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 2968 data1x25ExpectedDistinctK1P20[1..$]); 2969 2970 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 2971 data1x25ExpectedDistinctK1P20[1..$]); 2972 2973 2974 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 2975 data1x25ExpectedDistinctK1P20Probs); 2976 2977 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 2978 data1x25ExpectedDistinctK1P20Probs); 2979 2980 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 2981 data1x25ExpectedDistinctK1P20Probs[1..$]); 2982 2983 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 2984 data1x25ExpectedDistinctK1P20Probs[1..$]); 2985 2986 2987 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 2988 data1x25ExpectedDistinctK1P20ProbsInorder); 2989 2990 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 2991 data1x25ExpectedDistinctK1P20ProbsInorder); 2992 2993 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 2994 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 2995 2996 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 2997 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 2998 2999 }