/**
Command line tool for randomizing or sampling lines from input streams. Several
sampling methods are available, including simple random sampling, weighted random
sampling, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2019, eBay Software Foundation
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_sample;

import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSample to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first.
     *
     * Returns: The exit code supplied by processArgs when it signals that execution
     *   should not continue; otherwise 0 on success, or 1 if tsvSample throws an
     *   Exception.
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;

        /* r[0]: continue processing? r[1]: exit code to use when not continuing. */
        const r = cmdopt.processArgs(cmdArgs);
        if (!r[0]) return r[1];

        /* NOTE(review): presumably discards profile data gathered during startup and
         * argument processing so that only the sampling work is profiled - confirm.
         */
        version(LDC_Profile)
        {
            import ldc.profile : resetAll;
            resetAll();
        }

        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/** Short help text, printed ahead of the option list when help is requested. */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

The '--n|num' option limits the sample size produced. It speeds up line
order randomization and weighted sampling significantly. It is also used
to terminate sampling with replacement.

Use '--help-verbose' for detailed information.

Options:
EOS";

/** Detailed help text, printed ahead of the option list for '--help-verbose'. */
immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Line order randomization (the default): All input lines are output in a
  random order. All orderings are equally likely.
* Weighted line order randomization (--w|weight-field): Lines are selected
  using weighted random sampling, with the weight taken from a field.
  Lines are output in weighted selection order, reordering the lines.
* Sampling with replacement (--r|replace, --n|num): All input is read into
  memory, then lines are repeatedly selected at random and written out. This
  continues until --n|num samples are output. Lines can be selected multiple
  times. Output continues forever if --n|num is zero or not specified.
* Bernoulli sampling (--p|prob): A random subset of lines is output based
  on an inclusion probability. This is a streaming operation. A selection
  decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields, --p|prob): Input lines are sampled
  based on the values in the key field. A subset of the keys are chosen
  based on the inclusion probability (a 'distinct' set of keys). All lines
  with one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option limits the sample size produced. This
speeds up line order randomization and weighted sampling significantly
(details below). It is also used to terminate sampling with replacement.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, so there is no memory accumulation. These
algorithms support arbitrary size inputs. Sampling with replacement reads
all lines into memory and is limited by available memory. The line order
randomization algorithms hold the full output set in memory prior to
generating results. This ultimately limits the size of the output set.
These memory needs can be reduced by using a sample size (--n|num). This
engages reservoir sampling. Output order is not affected. Both
'tsv-sample -n 1000' and 'tsv-sample | head -n 1000' produce the same
results, but the former is quite a bit faster.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";

/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, the TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
*/
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    size_t sampleSize = 0;                     /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     *
     * Params:
     *   cmdArgs = Command line arguments, program name first. On success the array
     *             is truncated to the program name; remaining args are moved to 'files'.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options
            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "s|static-seed", " Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter", "CHR Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);  // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    /* The option has no short form; name it as the user must type it. */
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                /* Unweighted gen-random-inorder is handled by the Bernoulli sampling routine. */
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                /* Report the option by its actual command line name. */
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Random value printing implies compatibility-mode, otherwise user's selection is used. */
            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1..$] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
/** Invokes the appropriate sampling routine based on the command line arguments.
 *
 * tsvSample is the top-level routine handling the different tsv-sample use cases.
* Its primary role is to invoke the correct routine for type of sampling requested.
 */
void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* The checks run in priority order; the first matching mode handles the request. */

    if (cmdopt.srsWithReplacement)
    {
        simpleRandomSamplingWithReplacement(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useBernoulliSampling)
    {
        bernoulliSamplingCommand(cmdopt, outputStream);
        return;
    }

    if (cmdopt.useDistinctSampling)
    {
        /* The gen-random-inorder choice is a compile-time flag of distinctSampling. */
        if (cmdopt.genRandomInorder)
        {
            distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            distinctSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
        return;
    }

    if (cmdopt.genRandomInorder)
    {
        /* The modes checked above either deal with gen-random-inorder on their own
         * (Bernoulli, Distinct) or do not support it (SRS w/ Replacement). What
         * remains is the weighted case.
         */
        assert(cmdopt.hasWeightField);
        generateWeightedRandomValuesInorder(cmdopt, outputStream);
        return;
    }

    /* Line order randomization: a sample size selects reservoir sampling. */
    if (cmdopt.sampleSize == 0)
    {
        randomizeLinesCommand(cmdopt, outputStream);
    }
    else
    {
        reservoirSamplingCommand(cmdopt, outputStream);
    }
}

/** Invokes the appropriate Bernoulli sampling routine based on the command line
 * arguments.
 *
 * This routine selects the appropriate Bernoulli sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * One of the basic choices is whether to use the vanilla algorithm or skip sampling.
 * Skip sampling is a tad faster when the inclusion probability is small but doesn't
 * support compatibility mode. See the bernoulliSkipSampling documentation for a
 * discussion of the skipSamplingProbabilityThreshold used here.
*/
void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(!cmdopt.hasWeightField);

    immutable double skipSamplingProbabilityThreshold = 0.04;

    /* Skip sampling requires an inclusion probability strictly below 1.0 (it takes
     * the log of the discard rate), so a probability of 1.0 always uses the standard
     * algorithm, even when --prefer-skip-sampling is given.
     */
    if (cmdopt.compatibilityMode ||
        cmdopt.inclusionProbability >= 1.0 ||
        (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling))
    {
        if (cmdopt.genRandomInorder)
        {
            bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream);
        }
        else
        {
            bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream);
        }
    }
    else
    {
        bernoulliSkipSampling(cmdopt, outputStream);
    }
}

/** Bernoulli sampling of lines from the input stream.
 *
 * Each input line is assigned a random value and output if less than
 * cmdopt.inclusionProbability. The order of the lines is not changed.
 *
 * This routine supports random value printing and gen-random-inorder value printing.
 *
 * Params:
 *   generateRandomAll = When Yes, every input line is written with its random value
 *                       prepended (gen-random-inorder); no selection is done.
 *   cmdopt            = Command line options. Reads files, seed, hasHeader,
 *                       randomValueHeader, delim, printRandom, inclusionProbability,
 *                       and sampleSize.
 *   outputStream      = Output range receiving the header and selected lines.
 */
void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    static if (generateRandomAll)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    else if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }

                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* One random value is drawn per data line, whether or not it is selected. */
                immutable double lineScore = uniform01(randomGenerator);

                static if (generateRandomAll)
                {
                    outputStream.formatRandomValue(lineScore);
                    outputStream.put(cmdopt.delim);
                    outputStream.put(line);
                    outputStream.put("\n");

                    /* A non-zero sample size caps the number of lines written. */
                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
                else if (lineScore < cmdopt.inclusionProbability)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.formatRandomValue(lineScore);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");

                    /* A non-zero sample size caps the number of lines written. */
                    if (cmdopt.sampleSize != 0)
                    {
                        ++numLinesWritten;
                        if (numLinesWritten == cmdopt.sampleSize) return;
                    }
                }
            }
        }
    }
}

/** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips.
 *
 * Skip sampling works by skipping a random number of lines between selections. This
 * can be faster than assigning a random value to each line when the inclusion
 * probability is low, as it reduces the number of calls to the random number
 * generator. Both the random number generator and the log() function are called when
 * calculating the next skip size. These additional log() calls add up as the
 * inclusion probability increases.
*
 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for
 * file-oriented line sampling. This is obviously environment specific. In the
 * environments this implementation has been tested in the performance improvements
 * remain small, less than 7%, even with an inclusion probability as low as 0.0001.
 *
 * The algorithm does not assign random values to individual lines. This makes it
 * incompatible with random value printing. It is not suitable for compatibility mode
 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should
 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling
 * does not have this property.
 *
 * The algorithm for calculating the skip size has been described by multiple sources.
 * There are two key variants depending on whether the total number of lines in the
 * data set is known in advance. (This implementation does not know the total.)
 * Useful references:
 * $(LIST
 *     * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling",
 *       ACM Trans on Mathematical Software, 1987. On-line:
 *       http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf
 *     * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book
 *       "Data Stream Management", Springer-Verlag, 2016. On-line:
 *       https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf
 *     * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line:
 *       http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/
 * )
 *
 * The output range is taken by 'auto ref', like the other sampling routines, so an
 * lvalue output range is written to directly rather than through a copy.
 */
void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.conv : to;
    import std.math : log, trunc;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.compatibilityMode);

    auto randomGenerator = Random(cmdopt.seed);

    immutable double discardRate = 1.0 - cmdopt.inclusionProbability;
    immutable double logDiscardRate = log(discardRate);

    /* Draws the number of lines to skip before the next selected line.
     *
     * Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed
     * interval to (0.0, 1.0], excluding 0.0.
     */
    size_t nextSkipSize()
    {
        return (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t;
    }

    size_t remainingSkips = nextSkipSize();

    /* Process each line. */
    bool headerWritten = false;
    size_t numLinesWritten = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header from the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else if (remainingSkips > 0)
            {
                --remainingSkips;
            }
            else
            {
                outputStream.put(line);
                outputStream.put("\n");

                /* A non-zero sample size caps the number of lines written. */
                if (cmdopt.sampleSize != 0)
                {
                    ++numLinesWritten;
                    if (numLinesWritten == cmdopt.sampleSize) return;
                }

                remainingSkips = nextSkipSize();
            }
        }
    }
}

/** Sample a subset of lines by choosing a random set of values from key fields.
 *
 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling.
 * However, instead of each line being subject to an independent trial, lines are
 * selected based on a key from each line. A portion of keys are randomly selected for
 * output, and every line containing a selected key is included in the output.
 *
 * An example use-case is a query log having <user, query, clicked-url> triples. It is
 * often useful to sample records for portion of the users, but including all records
 * for the users selected. Distinct sampling supports this by selecting the subset of
 * users included in the output.
 *
 * Distinct sampling is done by hashing the key and mapping the hash value into
 * buckets matching the inclusion probability. Records having a key mapping to bucket
 * zero are output.
*/
void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.conv : to;
    import std.digest.murmurhash;
    import std.math : lrint;
    import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix;

    /* The compile-time flag must agree with the run-time option. */
    static if (generateRandomAll) assert(cmdopt.genRandomInorder);
    else assert(!cmdopt.genRandomInorder);

    assert(cmdopt.keyFields.length > 0);
    assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0);

    static if (generateRandomAll)
    {
        import std.format : formatValue, singleSpec;
        immutable randomValueFormatSpec = singleSpec("%d");
    }

    /* Delimiter as a byte array, inserted between fields of a multi-field hash key. */
    immutable ubyte[1] delimArray = [cmdopt.delim];

    /* Number of hash buckets: the inverse of the inclusion probability, rounded. */
    uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint;

    /* Key field extractor. Not needed (null) when the whole line is the key. */
    auto keyExtractor = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields);

    bool wroteHeader = false;
    size_t linesEmitted = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1)
            {
                throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

                if (cmdopt.hasHeader)
                {
                    /* Header line. Written once, from the first file only. */
                    if (!wroteHeader)
                    {
                        if (generateRandomAll || cmdopt.printRandom)
                        {
                            outputStream.put(cmdopt.randomValueHeader);
                            outputStream.put(cmdopt.delim);
                        }

                        outputStream.put(line);
                        outputStream.put("\n");
                        wroteHeader = true;
                    }
                    continue;
                }
            }

            /* Data line. The key is fed to Murmurhash piece by piece and the hash is
             * then finalized. The whole-line key and the field-list key are assembled
             * separately, as that keeps each case simple.
             */
            auto hasher = MurmurHash3!32(cmdopt.seed);

            if (cmdopt.distinctKeyIsFullLine)
            {
                hasher.put(cast(ubyte[]) line);
            }
            else
            {
                assert(keyExtractor !is null);

                /* Walk the fields until all key fields have been captured. */
                keyExtractor.initNewLine;
                foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate)
                {
                    keyExtractor.processNextField(fieldIndex, fieldValue);
                    if (keyExtractor.allFieldsFilled) break;
                }

                if (!keyExtractor.allFieldsFilled)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Not enough fields in line. File: %s, Line: %s",
                               (filename == "-") ? "Standard Input" : filename, fileLineNum));
                }

                /* Hash the key fields, delimiter separated. */
                foreach (count, key; keyExtractor.outputFields.enumerate)
                {
                    if (count > 0) hasher.put(delimArray);
                    hasher.put(cast(ubyte[]) key);
                }
            }

            hasher.finish;

            const bucket = hasher.get % numBuckets;

            static if (generateRandomAll)
            {
                /* Every line is written, prefixed by its bucket number. */
                outputStream.formatValue(bucket, randomValueFormatSpec);
                outputStream.put(cmdopt.delim);
            }
            else
            {
                /* Only lines whose key maps to bucket zero are written. */
                if (bucket != 0) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put('0');
                    outputStream.put(cmdopt.delim);
                }
            }

            outputStream.put(line);
            outputStream.put("\n");

            /* A non-zero sample size caps the number of lines written. */
            if (cmdopt.sampleSize != 0)
            {
                ++linesEmitted;
                if (linesEmitted == cmdopt.sampleSize) return;
            }
        }
    }
}

/** Invokes the appropriate reservoir sampling routine based on the command line
 * arguments.
 *
 * This routine selects the appropriate reservoir sampling function and template
 * instantiation to use based on the command line arguments.
 *
 * Reservoir sampling is used when a fixed size sample is being selected from an
 * input stream. Weighted and unweighted sampling is supported. These routines also
 * randomize the order of the selected lines. This is consistent with line order
 * randomization of the entire input stream (handled by randomizeLinesCommand).
 *
 * For unweighted sampling there is a performance tradeoff between the two available
 * implementations. Heap-based sampling is faster for small sample sizes, Algorithm R
 * is faster for large sample sizes. The threshold used here was chosen based on
 * performance tests. See the reservoirSamplingAlgorithmR documentation for more
 * information.
*/

void reservoirSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    assert(cmdopt.sampleSize != 0);

    /* Sample sizes at or above this value are routed to Algorithm R. */
    enum size_t algorithmRMinSampleSize = 128 * 1024;

    /* Weighted sampling is only available in the heap-based implementation. */
    if (cmdopt.hasWeightField)
    {
        reservoirSamplingViaHeap!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Unweighted: the heap is used in compatibility mode, and for smaller samples
     * unless Algorithm R was explicitly preferred.
     */
    immutable bool useHeap =
        cmdopt.compatibilityMode ||
        (cmdopt.sampleSize < algorithmRMinSampleSize && !cmdopt.preferAlgorithmR);

    if (useHeap) reservoirSamplingViaHeap!(No.isWeighted)(cmdopt, outputStream);
    else reservoirSamplingAlgorithmR(cmdopt, outputStream);
}

/** Reservoir sampling using a heap. Both weighted and unweighted random sampling are
 * supported.
 *
 * The algorithm used here is based on the one-pass algorithm described by Pavlos
 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S.
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line
 * with the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in the opposite order needed for
 * output. The desired order is obtained by removing each element one at a time from
 * the heap. The underlying data store will have the elements in correct order.
 *
 * Generating output in weighted order matters for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
*  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency is maintained when making repeated use of the same random seed,
 *    but with different sample sizes.
 *
 * There are use cases where only the selection set matters, for these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order, but making this distinction seems an unnecessary complication.
 *
 * Notes:
 * $(LIST
 *    * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *      randomization of all input lines. This was dropped in version 1.2.2 in favor
 *      of the approach used in randomizeLines. The latter has significant advantages
 *      given that all data must be read into memory.
 *    * For large reservoir sizes better performance can be achieved using Algorithm R.
 *      See the reservoirSamplingAlgorithmR documentation for details.
 * )
 */
void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, OutputRange)
    (TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.container.array;
    import std.container.binaryheap;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize > 0);

    /* A retained line together with the random score it was assigned. */
    struct Entry
    {
        double score;
        char[] line;
    }

    auto rng = Random(cmdopt.seed);

    /* Create the heap and its backing data store.
     *
     * Note: An std.container.array is used as the backing store to avoid some issues in
     * the standard library (Phobos) binaryheap implementation. Specifically, when an
     * std.container.array is used as backing store, the heap can be efficiently reversed
     * by removing the heap elements. This leaves the backing store in the reversed order.
     * However, the current binaryheap implementation does not support this for all
     * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094.
     */
    Array!Entry backingStore;
    backingStore.reserve(cmdopt.sampleSize);
    auto minHeap = backingStore.heapify!("a.score > b.score")(0);  // Min binaryheap

    /* Read the input, keeping the sampleSize highest scoring lines. */
    bool headerWritten = false;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (headerWritten) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            static if (isWeighted)
            {
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);

                /* Non-positive weights get a zero score without drawing a random value. */
                immutable double lineScore =
                    (lineWeight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / lineWeight)
                    : 0.0;
            }
            else
            {
                immutable double lineScore = uniform01(rng);
            }

            if (minHeap.length < cmdopt.sampleSize)
            {
                /* Still filling the reservoir. */
                minHeap.insert(Entry(lineScore, line.dup));
            }
            else if (minHeap.front.score < lineScore)
            {
                /* Beats the lowest retained score, so it displaces that entry. */
                minHeap.replaceFront(Entry(lineScore, line.dup));
            }
        }
    }

    /* All selected entries are in the heap. The heap is in the reverse of the desired
     * output order. Removing every element from the heap leaves the backing store in
     * the correct order for output.
     *
     * The asserts here avoid issues with the current binaryheap implementation. They
     * detect use of backing stores having a length not synchronized to the heap.
     */
    immutable size_t numSelected = minHeap.length;
    assert(numSelected == backingStore.length);

    while (!minHeap.empty) minHeap.removeFront;
    assert(numSelected == backingStore.length);

    foreach (entry; backingStore)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.score);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.line);
        outputStream.put("\n");
    }
}

/** Generates weighted random values for all input lines, preserving input order.
 *
 * This complements weighted reservoir sampling, but instead of using a reservoir it
 * simply iterates over the input lines generating the values. The weighted random
 * values are generated with the same formula used by reservoirSampling.
 */
void generateWeightedRandomValuesInorder(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.hasWeightField);

    auto rng = Random(cmdopt.seed);

    bool headerWritten = false;
    size_t numLinesWritten = 0;

    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (headerWritten) continue;

                outputStream.put(cmdopt.randomValueHeader);
                outputStream.put(cmdopt.delim);
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            immutable double lineWeight =
                getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum);

            /* Non-positive weights get a zero score without drawing a random value. */
            immutable double lineScore =
                (lineWeight > 0.0)
                ? uniform01(rng) ^^ (1.0 / lineWeight)
                : 0.0;

            outputStream.formatRandomValue(lineScore);
            outputStream.put(cmdopt.delim);
            outputStream.put(line);
            outputStream.put("\n");

            /* Stop as soon as the requested number of lines has been written. */
            if (cmdopt.sampleSize != 0 && ++numLinesWritten == cmdopt.sampleSize) return;
        }
    }
}

/** Reservoir sampling via Algorithm R
 *
 * This is an implementation of reservoir sampling using what is commonly known as
 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of
 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about
 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with
 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling"
 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R).
 *
 * Algorithm R is used for unweighted sampling without replacement. The heap-based
 * algorithm in reservoirSamplingViaHeap is used for weighted sampling.
 *
 * The classic algorithm stops after identifying the selected set of items. This
 * implementation goes one step further and randomizes the order of the selected
 * lines.
This supports the tsv-sample use-case, which is line order randomization.
 *
 * This algorithm is faster than reservoirSamplingViaHeap when the sample size
 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size.
 * Insertion in this algorithm is O(1). Similarly, generating the random order in the
 * heap is O(k * log k), while in this algorithm the final randomization step is O(k).
 *
 * This speed advantage may be offset a certain amount by using a more expensive random
 * value generator. reservoirSamplingViaHeap generates values between zero and one,
 * whereas reservoirSamplingAlgorithmR generates random integers over an ever growing
 * interval. The latter is expected to be more expensive. This is consistent with
 * performance tests indicating that reservoirSamplingViaHeap is faster when using
 * small-to-medium size reservoirs and large input streams.
 */
void reservoirSamplingAlgorithmR(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.random : Random = Mt19937, randomShuffle, uniform;
    import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix;

    assert(cmdopt.sampleSize > 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.compatibilityMode);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    string[] reservoir;
    auto reservoirAppender = appender(&reservoir);
    reservoirAppender.reserve(cmdopt.sampleSize);

    auto randomGenerator = Random(cmdopt.seed);

    /* Process each line. totalLineNum counts the data lines seen before the current
     * one, so it is also the zero-based index of the current data line.
     */
    bool headerWritten = false;
    size_t totalLineNum = 0;
    foreach (filename; cmdopt.files)
    {
        auto inputStream = (filename == "-") ? stdin : filename.File();
        foreach (fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1))
        {
            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (!headerWritten)
                {
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                /* Add lines to the reservoir until the reservoir is filled.
                 * After that lines are added with decreasing likelihood, based on
                 * the total number of lines seen. If added to the reservoir, the
                 * line replaces a randomly chosen existing line.
                 */
                if (totalLineNum < cmdopt.sampleSize)
                {
                    reservoirAppender ~= line.idup;
                }
                else
                {
                    /* The current line is data line number (totalLineNum + 1), so it
                     * must be kept with probability sampleSize / (totalLineNum + 1).
                     * std.random.uniform excludes its upper bound by default, hence
                     * the draw is over [0, totalLineNum + 1), i.e. [0, totalLineNum].
                     * Drawing over [0, totalLineNum) instead would always accept the
                     * first line after the reservoir fills and bias the sample.
                     */
                    immutable size_t i = uniform(0, totalLineNum + 1, randomGenerator);
                    if (i < reservoir.length) reservoir[i] = line.idup;
                }

                ++totalLineNum;
            }
        }
    }

    /* The random sample is now in the reservoir. Shuffle it and print. */

    reservoir.randomShuffle(randomGenerator);

    foreach (ref line; reservoir)
    {
        outputStream.put(line);
        outputStream.put("\n");
    }
}

/** This routine is invoked when all input lines are being randomized. It selects the
 * appropriate function and template instantiation based on the command line arguments.
 *
 * Different randomization algorithms are used when all input lines are being randomized
 * rather than a subset. The key distinction being that if all input needs to be read
 * into memory to support the algorithm, it works better to simply read the data all at
 * once.
 *
 * There are two different types of algorithms used. Array shuffling is used for
 * unweighted randomization. Sorting is used for weighted randomization or when
 * compatibility mode is needed.
 *
 * The algorithms used here are all limited by available memory.
*/
void randomizeLinesCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    /* Weighted randomization is only available in the sort-based routine. */
    if (cmdopt.hasWeightField)
    {
        randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream);
        return;
    }

    /* Unweighted: compatibility mode uses sorting, otherwise shuffling is used. */
    if (cmdopt.compatibilityMode) randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream);
    else randomizeLinesViaShuffle(cmdopt, outputStream);
}

/** Randomize all the lines in files or standard input using assigned random weights
 * and sorting.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. This algorithm assigns a random value to each line and sorts. This approach
 * supports both weighted sampling and simple random sampling (unweighted).
 *
 * This is significantly faster than heap-based reservoir sampling in the case where
 * the entire file is being read. See also randomizeLinesViaShuffle for the unweighted
 * case, as it is a little faster, at the cost of not supporting random value printing
 * or compatibility-mode.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 */
void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map, sort;

    static if (isWeighted) assert(cmdopt.hasWeightField);
    else assert(!cmdopt.hasWeightField);

    assert(cmdopt.sampleSize == 0);

    /* Load every file into memory, then split the data into lines, assigning each
     * line a random value. identifyFileLines also writes the first header line.
     */
    const fileData = cmdopt.files.map!FileData.array;
    auto scoredLines = fileData.identifyFileLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream);

    /* Order the lines from highest to lowest random value, then write them out. */
    scoredLines.sort!("a.randomValue > b.randomValue");

    foreach (entry; scoredLines)
    {
        if (cmdopt.printRandom)
        {
            outputStream.formatRandomValue(entry.randomValue);
            outputStream.put(cmdopt.delim);
        }
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}

/** Randomize all the lines in files or standard input using a shuffling algorithm.
 *
 * All lines in files and/or standard input are read in and written out in random
 * order. This routine uses array shuffling, which is faster than sorting. It is a
 * good alternative to randomizeLinesViaSort when doing unweighted randomization.
 *
 * Input data size is limited by available memory. Disk oriented techniques are needed
 * when data sizes are larger. For example, generating random values line-by-line (ala
 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort.
 *
 * This routine does not support random value printing or compatibility-mode.
 */
void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, randomShuffle;

    assert(cmdopt.sampleSize == 0);
    assert(!cmdopt.hasWeightField);
    assert(!cmdopt.printRandom);
    assert(!cmdopt.genRandomInorder);

    /* Load every file into memory and split the data into lines. identifyFileLines
     * also writes the first header line.
     */
    const fileData = cmdopt.files.map!FileData.array;
    auto allLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Shuffle the lines and write them out.
     *
     * Note: Also tried randomCover, but that was exceedingly slow.
     */
    auto rng = Random(cmdopt.seed);
    allLines.randomShuffle(rng);

    foreach (ref entry; allLines)
    {
        outputStream.put(entry.data);
        outputStream.put("\n");
    }
}

/** Simple random sampling with replacement.
 *
 * All lines in files and/or standard input are read in. Then random lines are selected
 * one at a time and output. Lines can be selected multiple times. This process continues
 * until the desired number of samples (--n|num) has been output. Output continues
 * indefinitely if a sample size was not provided.
 */
void simpleRandomSamplingWithReplacement(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : map;
    import std.random : Random = Mt19937, uniform;

    /* Load every file into memory and split the data into lines. identifyFileLines
     * also writes the first header line.
     */
    const fileData = cmdopt.files.map!FileData.array;
    const inputLines = fileData.identifyFileLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream);

    /* Nothing can be sampled from empty input. */
    if (inputLines.length == 0) return;

    auto rng = Random(cmdopt.seed);

    /* Repeat forever if sampleSize is zero, otherwise print sampleSize lines. */
    immutable bool unbounded = (cmdopt.sampleSize == 0);
    size_t numWritten = 0;
    while (unbounded || numWritten < cmdopt.sampleSize)
    {
        immutable size_t index = uniform(0, inputLines.length, rng);
        outputStream.put(inputLines[index].data);
        outputStream.put("\n");
        if (!unbounded) ++numWritten;
    }
}

/** A container and reader of data from a file or standard input.
 *
 * The FileData struct is used to read data from a file or standard input. It is used
 * by passing a filename to the constructor. The constructor reads the file data.
 * If the filename is a single hyphen ('-') then data is read from standard input.
 *
 * The struct makes the data available through two members: 'filename', which is the
 * filename, and 'data', which is a character array of the data.
 */
struct FileData
{
    string filename;
    char[] data;

    this(string fname)
    {
        import std.algorithm : min;
        import std.array : appender;

        filename = fname;

        immutable bool fromStdin = (filename == "-");
        auto ifile = fromStdin ? stdin : filename.File;
        auto dataAppender = appender(&data);

        /* For regular files, pre-size the buffer when the file size is available. */
        if (!fromStdin)
        {
            immutable ulong filesize = ifile.size;
            if (filesize < ulong.max) dataAppender.reserve(min(filesize, size_t.max));
        }

        /* Read in fixed size chunks, appending each chunk to 'data'. */
        ubyte[1024 * 128] fileRawBuf;
        foreach (ref ubyte[] buffer; ifile.byChunk(fileRawBuf)) dataAppender.put(cast(char[]) buffer);
    }
}

/** HasRandomValue is a boolean flag used at compile time by identifyFileLines to
 * distinguish use cases needing random value assignments from those that don't.
 */
alias HasRandomValue = Flag!"hasRandomValue";

/** An InputLine array is returned by identifyFileLines to represent each non-header
 * line found in a FileData array. The 'data' element contains the line. A 'randomValue'
 * line is included if random values are being generated.
*/
struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;
    static if (hasRandomValue) double randomValue;
}

/** identifyFileLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Three primary tasks are performed. One is splitting all input data into lines. The
 * second is writing the header line from the first file to the output stream. Header
 * lines from subsequent files are ignored. Third is assigning a random value to the
 * line, if random values are being generated.
 *
 * The key input is a FileData array, one element for each file. The FileData reads
 * the file when instantiated.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyFileLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
    (const ref FileData[] fileData, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted values can only be produced when random values are produced at all. */
    static assert(hasRandomValue || !isWeighted);
    static if (!hasRandomValue) assert(!cmdopt.printRandom);

    alias Line = InputLine!hasRandomValue;

    Line[] inputLines;
    auto linesAppender = appender(&inputLines);

    static if (hasRandomValue) auto rng = Random(cmdopt.seed);

    bool headerWritten = false;

    foreach (fd; fileData)
    {
        /* Drop the last newline to avoid adding an extra empty line. */
        immutable bool endsWithNewline = (fd.data.length > 0 && fd.data[$ - 1] == '\n');
        const data = endsWithNewline ? fd.data[0 .. $ - 1] : fd.data;

        foreach (fileLineNum, ref line; data.splitter('\n').enumerate(1))
        {
            immutable bool isFirstLine = (fileLineNum == 1);
            if (isFirstLine) throwIfWindowsNewlineOnUnix(line, fd.filename, fileLineNum);

            if (isFirstLine && cmdopt.hasHeader)
            {
                /* Only the header of the first file is written. */
                if (headerWritten) continue;

                if (cmdopt.printRandom)
                {
                    outputStream.put(cmdopt.randomValueHeader);
                    outputStream.put(cmdopt.delim);
                }
                outputStream.put(line);
                outputStream.put("\n");
                headerWritten = true;
                continue;
            }

            static if (!hasRandomValue)
            {
                linesAppender.put(Line(line));
            }
            else static if (!isWeighted)
            {
                immutable double randomValue = uniform01(rng);
                linesAppender.put(Line(line, randomValue));
            }
            else
            {
                immutable double lineWeight =
                    getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                         fd.filename, fileLineNum);

                /* Non-positive weights get a zero value without drawing a random number. */
                immutable double randomValue =
                    (lineWeight > 0.0)
                    ? uniform01(rng) ^^ (1.0 / lineWeight)
                    : 0.0;

                linesAppender.put(Line(line, randomValue));
            }
        }
    }

    return inputLines;
}

/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
1483 * 1484 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12. 1485 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar 1486 * results were seen with weighted sampling with integer weights. The same is not true 1487 * with floating point weights. These produce quite large exponents. However, even 1488 * for floating point weights this can be useful. For random weights [0,1] less than 5% 1489 * will be less than 1e-12 and use exponential notation. 1490 */ 1491 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value) 1492 if (isOutputRange!(OutputRange, char)) 1493 { 1494 import std.format : formatValue, singleSpec; 1495 1496 immutable spec17f = singleSpec("%.17f"); 1497 immutable spec18f = singleSpec("%.18f"); 1498 immutable spec19f = singleSpec("%.19f"); 1499 immutable spec20f = singleSpec("%.20f"); 1500 immutable spec21f = singleSpec("%.21f"); 1501 immutable spec22f = singleSpec("%.22f"); 1502 immutable spec23f = singleSpec("%.23f"); 1503 immutable spec24f = singleSpec("%.24f"); 1504 immutable spec25f = singleSpec("%.25f"); 1505 immutable spec26f = singleSpec("%.26f"); 1506 immutable spec27f = singleSpec("%.27f"); 1507 immutable spec28f = singleSpec("%.28f"); 1508 1509 immutable spec17g = singleSpec("%.17g"); 1510 1511 immutable formatSpec = 1512 (value >= 1e-01) ? spec17f : 1513 (value >= 1e-02) ? spec18f : 1514 (value >= 1e-03) ? spec19f : 1515 (value >= 1e-04) ? spec20f : 1516 (value >= 1e-05) ? spec21f : 1517 (value >= 1e-06) ? spec22f : 1518 (value >= 1e-07) ? spec23f : 1519 (value >= 1e-08) ? spec24f : 1520 (value >= 1e-09) ? spec25f : 1521 (value >= 1e-10) ? spec26f : 1522 (value >= 1e-11) ? spec27f : 1523 (value >= 1e-12) ? 
spec28f : spec17g; 1524 1525 outputStream.formatValue(value, formatSpec); 1526 } 1527 1528 unittest 1529 { 1530 void testFormatValue(double value, string expected) 1531 { 1532 import std.array : appender; 1533 import std.format : format; 1534 1535 auto s = appender!string(); 1536 s.formatRandomValue(value); 1537 assert(s.data == expected, 1538 format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data)); 1539 } 1540 1541 testFormatValue(1.0, "1.00000000000000000"); 1542 testFormatValue(0.1, "0.10000000000000001"); 1543 testFormatValue(0.01, "0.010000000000000000"); 1544 testFormatValue(1e-03, "0.0010000000000000000"); 1545 testFormatValue(1e-04, "0.00010000000000000000"); 1546 testFormatValue(1e-05, "0.000010000000000000001"); 1547 testFormatValue(1e-06, "0.0000010000000000000000"); 1548 testFormatValue(1e-07, "0.00000010000000000000000"); 1549 testFormatValue(1e-08, "0.000000010000000000000000"); 1550 testFormatValue(1e-09, "0.0000000010000000000000001"); 1551 testFormatValue(1e-10, "0.00000000010000000000000000"); 1552 testFormatValue(1e-11, "0.000000000009999999999999999"); 1553 testFormatValue(1e-12, "0.0000000000010000000000000000"); 1554 testFormatValue(1e-13, "1e-13"); 1555 testFormatValue(1e-14, "1e-14"); 1556 testFormatValue(12345678901234567e-15, "12.34567890123456735"); 1557 testFormatValue(12345678901234567e-16, "1.23456789012345669"); 1558 testFormatValue(12345678901234567e-17, "0.12345678901234566"); 1559 testFormatValue(12345678901234567e-18, "0.012345678901234567"); 1560 testFormatValue(12345678901234567e-19, "0.0012345678901234567"); 1561 testFormatValue(12345678901234567e-20, "0.00012345678901234567"); 1562 testFormatValue(12345678901234567e-21, "0.000012345678901234568"); 1563 testFormatValue(12345678901234567e-22, "0.0000012345678901234567"); 1564 testFormatValue(12345678901234567e-23, "0.00000012345678901234566"); 1565 testFormatValue(12345678901234567e-24, "0.000000012345678901234567"); 1566 
testFormatValue(12345678901234567e-25, "0.0000000012345678901234566"); 1567 testFormatValue(12345678901234567e-26, "0.00000000012345678901234568"); 1568 testFormatValue(12345678901234567e-27, "0.000000000012345678901234567"); 1569 testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567"); 1570 testFormatValue(12345678901234567e-29, "1.2345678901234566e-13"); 1571 } 1572 1573 1574 /** Convenience function for extracting a single field from a line. See 1575 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error 1576 * text tailored for this program. 1577 */ 1578 import std.traits : isSomeChar; 1579 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, size_t lineNum) pure @safe 1580 if (isSomeChar!C) 1581 { 1582 import std.conv : ConvException, to; 1583 import std.format : format; 1584 import tsv_utils.common.utils : getTsvFieldValue; 1585 1586 T val; 1587 try 1588 { 1589 val = getTsvFieldValue!T(line, fieldIndex, delim); 1590 } 1591 catch (ConvException exc) 1592 { 1593 throw new Exception( 1594 format("Could not process line: %s\n File: %s Line: %s%s", 1595 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 1596 (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); 1597 } 1598 catch (Exception exc) 1599 { 1600 /* Not enough fields on the line. */ 1601 throw new Exception( 1602 format("Could not process line: %s\n File: %s Line: %s", 1603 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 1604 } 1605 1606 return val; 1607 } 1608 1609 unittest 1610 { 1611 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 1612 * These tests make basic sanity checks on the getFieldValue wrapper. 
unittest
{
    /* Basic sanity checks on the getFieldValue wrapper. getTsvFieldValue has its
     * own tests, so only the wrapper's pass-through and error paths are exercised.
     */
    import std.exception : assertThrown;

    /* Values that convert cleanly are passed through. */
    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);

    /* Failures must throw on line 1 as well as on later lines. */
    static immutable size_t[] lineNumbers = [1, 2];
    foreach (lineNum; lineNumbers)
    {
        /* Field text that is not a number. */
        assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", lineNum));

        /* Field index past the last field on the line. */
        assertThrown(getFieldValue!double("123", 1, '\t', "unittest", lineNum));
    }
}

/* Unit tests for the main program start here.
 *
 * Portability note: Many of the tests here rely on generating consistent random numbers
 * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
 * possible this condition will not hold on other platforms.
 *
 * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results these tests will need to be adjusted.
 */
version(unittest)
{
    /* Helper functions shared by the unit tests. */

    import tsv_utils.common.unittest_utils;   // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /** Runs one tsv-sample test case.
     *
     * Processes `cmdArgs` into a TsvSampleOptions, calls tsvSample with an in-memory
     * appender as the output, and asserts that the captured text equals `expected`
     * after conversion with tsvDataToString.
     *
     * Params:
     *   cmdArgs  = Command line arguments. Must be non-empty; cmdArgs[0] is used to
     *              label assertion messages.
     *   expected = The expected output, as rows of fields.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        /* Builds an assertion message prefixed with the helper name and cmdArgs[0]. */
        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            return format("[testTsvSample] %s: " ~ msg, cmdArgs[0], formatArgs);
        }

        /* Text form of the arguments, taken before processArgs is called
         * (presumably because argument processing may modify the array).
         */
        string argsText = cmdArgs.to!string;

        TsvSampleOptions options;
        auto argsResult = options.processArgs(cmdArgs);
        assert(argsResult[0], formatAssertMessage("Invalid command lines arg: '%s'.", argsText));

        auto captured = appender!(char[])();
        tsvSample(options, captured);    // This invokes the main code line.

        auto expectedText = expected.tsvDataToString;
        assert(captured.data == expectedText,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedText.to!string, captured.data.to!string));
    }
}
data3x6Expected 1694 * - Sampling Type (required): Permute, Replace, Bernoulli, Distinct 1695 * - Compatibility: Compat, AlgoR, Skip, Swap 1696 * - Weight Field: Wt<num>, e.g. Wt3 1697 * - Sample Size: Num<num>, eg. Num3 1698 * - Seed Value: V<num>, eg. V77 1699 * - Key Field: K<num>, e.g. K2 1700 * - Probability: P<num>, e.g P05 (5%) 1701 * - Printing Probabilities: Probs 1702 * - Printing Probs in order: ProbsInorder 1703 * - Printing Probs with custom header: RVCustom 1704 */ 1705 1706 /* Empty file. */ 1707 string[][] dataEmpty = []; 1708 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 1709 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 1710 1711 /* 3x1, header only. */ 1712 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 1713 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 1714 writeUnittestTsvFile(fpath_data3x0, data3x0); 1715 1716 /* 3x1 */ 1717 string[][] data3x1 = 1718 [["field_a", "field_b", "field_c"], 1719 ["tan", "タン", "8.5"]]; 1720 1721 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 1722 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 1723 writeUnittestTsvFile(fpath_data3x1, data3x1); 1724 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1..$]); 1725 1726 string[][] data3x1ExpectedReplaceNum3 = 1727 [["field_a", "field_b", "field_c"], 1728 ["tan", "タン", "8.5"], 1729 ["tan", "タン", "8.5"], 1730 ["tan", "タン", "8.5"]]; 1731 1732 /* 3x2 */ 1733 string[][] data3x2 = 1734 [["field_a", "field_b", "field_c"], 1735 ["brown", "褐色", "29.2"], 1736 ["gray", "グレー", "6.2"]]; 1737 1738 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 1739 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 1740 writeUnittestTsvFile(fpath_data3x2, data3x2); 1741 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1..$]); 1742 1743 string[][] data3x2PermuteCompat = 1744 [["field_a", "field_b", "field_c"], 1745 ["gray", "グレー", "6.2"], 1746 ["brown", "褐色", "29.2"]]; 1747 1748 
string[][] data3x2PermuteShuffle = 1749 [["field_a", "field_b", "field_c"], 1750 ["gray", "グレー", "6.2"], 1751 ["brown", "褐色", "29.2"]]; 1752 1753 /* 3x3 */ 1754 string[][] data3x3 = 1755 [["field_a", "field_b", "field_c"], 1756 ["orange", "オレンジ", "2.5"], 1757 ["pink", "ピンク", "1.1"], 1758 ["purple", "紫の", "42"]]; 1759 1760 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 1761 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 1762 writeUnittestTsvFile(fpath_data3x3, data3x3); 1763 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1..$]); 1764 1765 string[][] data3x3ExpectedPermuteCompat = 1766 [["field_a", "field_b", "field_c"], 1767 ["purple", "紫の", "42"], 1768 ["pink", "ピンク", "1.1"], 1769 ["orange", "オレンジ", "2.5"]]; 1770 1771 string[][] data3x3ExpectedPermuteSwap = 1772 [["field_a", "field_b", "field_c"], 1773 ["purple", "紫の", "42"], 1774 ["orange", "オレンジ", "2.5"], 1775 ["pink", "ピンク", "1.1"]]; 1776 1777 /* 3x6 */ 1778 string[][] data3x6 = 1779 [["field_a", "field_b", "field_c"], 1780 ["red", "赤", "23.8"], 1781 ["green", "緑", "0.0072"], 1782 ["white", "白", "1.65"], 1783 ["yellow", "黄", "12"], 1784 ["blue", "青", "12"], 1785 ["black", "黒", "0.983"]]; 1786 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 1787 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 1788 writeUnittestTsvFile(fpath_data3x6, data3x6); 1789 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1..$]); 1790 1791 // Randomization, all lines 1792 string[][] data3x6ExpectedPermuteCompat = 1793 [["field_a", "field_b", "field_c"], 1794 ["yellow", "黄", "12"], 1795 ["black", "黒", "0.983"], 1796 ["blue", "青", "12"], 1797 ["white", "白", "1.65"], 1798 ["green", "緑", "0.0072"], 1799 ["red", "赤", "23.8"]]; 1800 1801 string[][] data3x6ExpectedPermuteSwap = 1802 [["field_a", "field_b", "field_c"], 1803 ["black", "黒", "0.983"], 1804 ["green", "緑", "0.0072"], 1805 ["red", "赤", "23.8"], 1806 ["yellow", "黄", "12"], 1807 ["white", "白", "1.65"], 
1808 ["blue", "青", "12"]]; 1809 1810 string[][] data3x6ExpectedPermuteCompatProbs = 1811 [["random_value", "field_a", "field_b", "field_c"], 1812 ["0.96055546286515892", "yellow", "黄", "12"], 1813 ["0.75710153928957880", "black", "黒", "0.983"], 1814 ["0.52525980887003243", "blue", "青", "12"], 1815 ["0.49287854949943721", "white", "白", "1.65"], 1816 ["0.15929344086907804", "green", "緑", "0.0072"], 1817 ["0.010968807619065046", "red", "赤", "23.8"]]; 1818 1819 /* Note: data3x6ExpectedAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 1820 * both are effectively the same algorithm given that --num is data length. Both read 1821 * in the full data in order then call randomShuffle. 1822 */ 1823 string[][] data3x6ExpectedPermuteAlgoRNum6 = 1824 [["field_a", "field_b", "field_c"], 1825 ["black", "黒", "0.983"], 1826 ["green", "緑", "0.0072"], 1827 ["red", "赤", "23.8"], 1828 ["yellow", "黄", "12"], 1829 ["white", "白", "1.65"], 1830 ["blue", "青", "12"]]; 1831 1832 string[][] data3x6ExpectedPermuteAlgoRNum5 = 1833 [["field_a", "field_b", "field_c"], 1834 ["red", "赤", "23.8"], 1835 ["black", "黒", "0.983"], 1836 ["white", "白", "1.65"], 1837 ["green", "緑", "0.0072"], 1838 ["yellow", "黄", "12"]]; 1839 1840 string[][] data3x6ExpectedPermuteAlgoRNum4 = 1841 [["field_a", "field_b", "field_c"], 1842 ["blue", "青", "12"], 1843 ["green", "緑", "0.0072"], 1844 ["black", "黒", "0.983"], 1845 ["white", "白", "1.65"]]; 1846 1847 string[][] data3x6ExpectedPermuteAlgoRNum3 = 1848 [["field_a", "field_b", "field_c"], 1849 ["red", "赤", "23.8"], 1850 ["black", "黒", "0.983"], 1851 ["green", "緑", "0.0072"]]; 1852 1853 string[][] data3x6ExpectedPermuteAlgoRNum2 = 1854 [["field_a", "field_b", "field_c"], 1855 ["black", "黒", "0.983"], 1856 ["red", "赤", "23.8"]]; 1857 1858 string[][] data3x6ExpectedPermuteAlgoRNum1 = 1859 [["field_a", "field_b", "field_c"], 1860 ["green", "緑", "0.0072"]]; 1861 1862 string[][] data3x6ExpectedBernoulliProbsP100 = 1863 [["random_value", "field_a", "field_b", 
"field_c"], 1864 ["0.010968807619065046", "red", "赤", "23.8"], 1865 ["0.15929344086907804", "green", "緑", "0.0072"], 1866 ["0.49287854949943721", "white", "白", "1.65"], 1867 ["0.96055546286515892", "yellow", "黄", "12"], 1868 ["0.52525980887003243", "blue", "青", "12"], 1869 ["0.75710153928957880", "black", "黒", "0.983"]]; 1870 1871 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 1872 [["random_value", "field_a", "field_b", "field_c"], 1873 ["0.010968807619065046", "red", "赤", "23.8"], 1874 ["0.15929344086907804", "green", "緑", "0.0072"], 1875 ["0.49287854949943721", "white", "白", "1.65"], 1876 ["0.52525980887003243", "blue", "青", "12"]]; 1877 1878 string[][] data3x6ExpectedBernoulliSkipP40 = 1879 [["field_a", "field_b", "field_c"], 1880 ["red", "赤", "23.8"], 1881 ["green", "緑", "0.0072"], 1882 ["yellow", "黄", "12"]]; 1883 1884 string[][] data3x6ExpectedBernoulliCompatP60 = 1885 [["field_a", "field_b", "field_c"], 1886 ["red", "赤", "23.8"], 1887 ["green", "緑", "0.0072"], 1888 ["white", "白", "1.65"], 1889 ["blue", "青", "12"]]; 1890 1891 string[][] data3x6ExpectedDistinctK1K3P60 = 1892 [["field_a", "field_b", "field_c"], 1893 ["green", "緑", "0.0072"], 1894 ["white", "白", "1.65"], 1895 ["blue", "青", "12"]]; 1896 1897 string[][] data3x6ExpectedDistinctK1K3P60Probs = 1898 [["random_value", "field_a", "field_b", "field_c"], 1899 ["0", "green", "緑", "0.0072"], 1900 ["0", "white", "白", "1.65"], 1901 ["0", "blue", "青", "12"]]; 1902 1903 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 1904 [["custom_random_value_header", "field_a", "field_b", "field_c"], 1905 ["0", "green", "緑", "0.0072"], 1906 ["0", "white", "白", "1.65"], 1907 ["0", "blue", "青", "12"]]; 1908 1909 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 1910 [["random_value", "field_a", "field_b", "field_c"], 1911 ["1", "red", "赤", "23.8"], 1912 ["0", "green", "緑", "0.0072"], 1913 ["0", "white", "白", "1.65"], 1914 ["1", "yellow", "黄", "12"], 1915 ["3", "blue", "青", "12"], 1916 ["2", "black", "黒", 
"0.983"]]; 1917 1918 string[][] data3x6ExpectedPermuteWt3Probs = 1919 [["random_value", "field_a", "field_b", "field_c"], 1920 ["0.99665198757645390", "yellow", "黄", "12"], 1921 ["0.94775884809836686", "blue", "青", "12"], 1922 ["0.82728234682286661", "red", "赤", "23.8"], 1923 ["0.75346697377181959", "black", "黒", "0.983"], 1924 ["0.65130103496422487", "white", "白", "1.65"], 1925 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 1926 1927 string[][] data3x6ExpectedWt3ProbsInorder = 1928 [["random_value", "field_a", "field_b", "field_c"], 1929 ["0.82728234682286661", "red", "赤", "23.8"], 1930 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 1931 ["0.65130103496422487", "white", "白", "1.65"], 1932 ["0.99665198757645390", "yellow", "黄", "12"], 1933 ["0.94775884809836686", "blue", "青", "12"], 1934 ["0.75346697377181959", "black", "黒", "0.983"]]; 1935 1936 string[][] data3x6ExpectedPermuteWt3 = 1937 [["field_a", "field_b", "field_c"], 1938 ["yellow", "黄", "12"], 1939 ["blue", "青", "12"], 1940 ["red", "赤", "23.8"], 1941 ["black", "黒", "0.983"], 1942 ["white", "白", "1.65"], 1943 ["green", "緑", "0.0072"]]; 1944 1945 string[][] data3x6ExpectedReplaceNum10 = 1946 [["field_a", "field_b", "field_c"], 1947 ["black", "黒", "0.983"], 1948 ["green", "緑", "0.0072"], 1949 ["green", "緑", "0.0072"], 1950 ["red", "赤", "23.8"], 1951 ["yellow", "黄", "12"], 1952 ["red", "赤", "23.8"], 1953 ["white", "白", "1.65"], 1954 ["yellow", "黄", "12"], 1955 ["yellow", "黄", "12"], 1956 ["white", "白", "1.65"], 1957 ]; 1958 1959 string[][] data3x6ExpectedReplaceNum10V77 = 1960 [["field_a", "field_b", "field_c"], 1961 ["black", "黒", "0.983"], 1962 ["red", "赤", "23.8"], 1963 ["black", "黒", "0.983"], 1964 ["yellow", "黄", "12"], 1965 ["green", "緑", "0.0072"], 1966 ["green", "緑", "0.0072"], 1967 ["green", "緑", "0.0072"], 1968 ["yellow", "黄", "12"], 1969 ["blue", "青", "12"], 1970 ["white", "白", "1.65"], 1971 ]; 1972 1973 /* Using a different static seed. 
*/ 1974 string[][] data3x6ExpectedPermuteCompatV41Probs = 1975 [["random_value", "field_a", "field_b", "field_c"], 1976 ["0.68057272653095424", "green", "緑", "0.0072"], 1977 ["0.67681624367833138", "blue", "青", "12"], 1978 ["0.32097338931635022", "yellow", "黄", "12"], 1979 ["0.25092361867427826", "red", "赤", "23.8"], 1980 ["0.15535934292711318", "black", "黒", "0.983"], 1981 ["0.046095821075141430", "white", "白", "1.65"]]; 1982 1983 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 1984 [["random_value", "field_a", "field_b", "field_c"], 1985 ["0.25092361867427826", "red", "赤", "23.8"], 1986 ["0.046095821075141430", "white", "白", "1.65"], 1987 ["0.32097338931635022", "yellow", "黄", "12"], 1988 ["0.15535934292711318", "black", "黒", "0.983"]]; 1989 1990 string[][] data3x6ExpectedPermuteWt3V41Probs = 1991 [["random_value", "field_a", "field_b", "field_c"], 1992 ["0.96799377498910666", "blue", "青", "12"], 1993 ["0.94356245792573568", "red", "赤", "23.8"], 1994 ["0.90964601024271996", "yellow", "黄", "12"], 1995 ["0.15491658409260103", "white", "白", "1.65"], 1996 ["0.15043620392537033", "black", "黒", "0.983"], 1997 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 1998 1999 string[][] data3x6ExpectedWt3V41ProbsInorder = 2000 [["random_value", "field_a", "field_b", "field_c"], 2001 ["0.94356245792573568", "red", "赤", "23.8"], 2002 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 2003 ["0.15491658409260103", "white", "白", "1.65"], 2004 ["0.90964601024271996", "yellow", "黄", "12"], 2005 ["0.96799377498910666", "blue", "青", "12"], 2006 ["0.15043620392537033", "black", "黒", "0.983"]]; 2007 2008 2009 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2010 string[][] combo1ExpectedPermuteCompat = 2011 [["field_a", "field_b", "field_c"], 2012 ["yellow", "黄", "12"], 2013 ["tan", "タン", "8.5"], 2014 ["brown", "褐色", "29.2"], 2015 ["green", "緑", "0.0072"], 2016 ["red", "赤", "23.8"], 2017 ["purple", "紫の", "42"], 2018 ["black", "黒", "0.983"], 2019 ["white", "白", "1.65"], 2020 ["gray", "グレー", "6.2"], 2021 ["blue", "青", "12"], 2022 ["pink", "ピンク", "1.1"], 2023 ["orange", "オレンジ", "2.5"]]; 2024 2025 string[][] combo1ExpectedPermuteCompatProbs = 2026 [["random_value", "field_a", "field_b", "field_c"], 2027 ["0.97088520275428891", "yellow", "黄", "12"], 2028 ["0.96055546286515892", "tan", "タン", "8.5"], 2029 ["0.81756894313730299", "brown", "褐色", "29.2"], 2030 ["0.75710153928957880", "green", "緑", "0.0072"], 2031 ["0.52525980887003243", "red", "赤", "23.8"], 2032 ["0.49287854949943721", "purple", "紫の", "42"], 2033 ["0.47081507067196071", "black", "黒", "0.983"], 2034 ["0.38388182921335101", "white", "白", "1.65"], 2035 ["0.29215990612283349", "gray", "グレー", "6.2"], 2036 ["0.24033216014504433", "blue", "青", "12"], 2037 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2038 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 2039 2040 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2041 string[][] combo1ExpectedProbsInorder = 2042 [["random_value", "field_a", "field_b", "field_c"], 2043 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2044 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2045 ["0.49287854949943721", "purple", "紫の", "42"], 2046 ["0.96055546286515892", "tan", "タン", "8.5"], 2047 ["0.52525980887003243", "red", "赤", "23.8"], 2048 ["0.75710153928957880", "green", "緑", "0.0072"], 2049 ["0.38388182921335101", "white", "白", "1.65"], 2050 ["0.97088520275428891", "yellow", "黄", "12"], 2051 ["0.24033216014504433", "blue", "青", "12"], 2052 ["0.47081507067196071", "black", "黒", "0.983"], 2053 ["0.81756894313730299", "brown", "褐色", "29.2"], 2054 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2055 2056 string[][] combo1ExpectedBernoulliCompatP50Probs = 2057 [["random_value", "field_a", "field_b", "field_c"], 2058 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2059 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2060 ["0.49287854949943721", "purple", "紫の", "42"], 2061 ["0.38388182921335101", "white", "白", "1.65"], 2062 ["0.24033216014504433", "blue", "青", "12"], 2063 ["0.47081507067196071", "black", "黒", "0.983"], 2064 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2065 2066 string[][] combo1ExpectedBernoulliCompatP40 = 2067 [["field_a", "field_b", "field_c"], 2068 ["orange", "オレンジ", "2.5"], 2069 ["pink", "ピンク", "1.1"], 2070 ["white", "白", "1.65"], 2071 ["blue", "青", "12"], 2072 ["gray", "グレー", "6.2"]]; 2073 2074 string[][] combo1ExpectedDistinctK1P40 = 2075 [["field_a", "field_b", "field_c"], 2076 ["orange", "オレンジ", "2.5"], 2077 ["red", "赤", "23.8"], 2078 ["green", "緑", "0.0072"], 2079 ["blue", "青", "12"], 2080 ["black", "黒", "0.983"]]; 2081 2082 string[][] combo1ExpectedPermuteWt3Probs = 2083 [["random_value", "field_a", "field_b", "field_c"], 2084 ["0.99754077523718754", "yellow", "黄", "12"], 2085 ["0.99527665440088786", "tan", "タン", "8.5"], 2086 ["0.99312578945741659", "brown", "褐色", "29.2"], 2087 ["0.98329602553389361", "purple", 
"紫の", "42"], 2088 ["0.97330961938083660", "red", "赤", "23.8"], 2089 ["0.88797551521739648", "blue", "青", "12"], 2090 ["0.81999230489041786", "gray", "グレー", "6.2"], 2091 ["0.55975569204250941", "white", "白", "1.65"], 2092 ["0.46472135609205739", "black", "黒", "0.983"], 2093 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2094 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2095 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2096 2097 string[][] combo1ExpectedPermuteWt3 = 2098 [["field_a", "field_b", "field_c"], 2099 ["yellow", "黄", "12"], 2100 ["tan", "タン", "8.5"], 2101 ["brown", "褐色", "29.2"], 2102 ["purple", "紫の", "42"], 2103 ["red", "赤", "23.8"], 2104 ["blue", "青", "12"], 2105 ["gray", "グレー", "6.2"], 2106 ["white", "白", "1.65"], 2107 ["black", "黒", "0.983"], 2108 ["pink", "ピンク", "1.1"], 2109 ["orange", "オレンジ", "2.5"], 2110 ["green", "緑", "0.0072"]]; 2111 2112 string[][] combo1ExpectedPermuteAlgoRNum4 = 2113 [["field_a", "field_b", "field_c"], 2114 ["blue", "青", "12"], 2115 ["gray", "グレー", "6.2"], 2116 ["brown", "褐色", "29.2"], 2117 ["white", "白", "1.65"]]; 2118 2119 string[][] combo1ExpectedReplaceNum10 = 2120 [["field_a", "field_b", "field_c"], 2121 ["gray", "グレー", "6.2"], 2122 ["yellow", "黄", "12"], 2123 ["yellow", "黄", "12"], 2124 ["white", "白", "1.65"], 2125 ["tan", "タン", "8.5"], 2126 ["white", "白", "1.65"], 2127 ["blue", "青", "12"], 2128 ["black", "黒", "0.983"], 2129 ["tan", "タン", "8.5"], 2130 ["purple", "紫の", "42"]]; 2131 2132 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 2133 string[][] data1x200 = 2134 [["field_a"], 2135 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2136 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2137 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2138 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2139 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2140 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2141 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2142 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2143 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2144 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2145 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2146 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2147 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2148 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2149 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2150 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2151 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2152 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2153 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2154 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2155 ]; 2156 2157 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 2158 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2159 writeUnittestTsvFile(fpath_data1x200, data1x200); 2160 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1..$]); 2161 2162 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2163 [["field_a"], 2164 ["077"], 2165 ["119"]]; 2166 2167 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2168 [["field_a"], 2169 ["038"], 2170 ["059"], 2171 ["124"], 2172 ["161"], 2173 ["162"], 2174 ["183"]]; 2175 2176 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2177 [["field_a"], 2178 ["025"], 2179 ["039"], 2180 ["082"], 2181 ["107"], 2182 ["108"], 2183 ["122"], 2184 ["136"], 2185 ["166"], 2186 ["182"]]; 2187 2188 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2189 [["field_a"], 2190 ["072"]]; 2191 2192 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2193 [["field_a"], 2194 ["004"], 2195 ["072"]]; 2196 2197 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2198 [["field_a"], 2199 ["004"], 2200 ["072"], 2201 ["181"]]; 2202 2203 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2204 * only expected results. The header is from 3x0, the results are offset 1-position 2205 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2206 */ 2207 string[][] combo2ExpectedBernoulliSkipV333P03 = 2208 [["field_a", "field_b", "field_c"], 2209 ["024"], 2210 ["038"], 2211 ["081"], 2212 ["106"], 2213 ["107"], 2214 ["121"], 2215 ["135"], 2216 ["165"], 2217 ["181"]]; 2218 2219 2220 /* 1x10 - Simple 1-column file. 
*/ 2221 string[][] data1x10 = 2222 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2223 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2224 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2225 writeUnittestTsvFile(fpath_data1x10, data1x10); 2226 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1..$]); 2227 2228 string[][] data1x10ExpectedPermuteCompat = 2229 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2230 2231 string[][] data1x10ExpectedPermuteWt1 = 2232 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2233 2234 /* 2x10a - Uniform distribution [0,1]. */ 2235 string[][] data2x10a = 2236 [["line", "weight"], 2237 ["1", "0.26788837"], 2238 ["2", "0.06601298"], 2239 ["3", "0.38627527"], 2240 ["4", "0.47379424"], 2241 ["5", "0.02966641"], 2242 ["6", "0.05636231"], 2243 ["7", "0.70529242"], 2244 ["8", "0.91836862"], 2245 ["9", "0.99103720"], 2246 ["10", "0.31401740"]]; 2247 2248 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2249 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2250 2251 string[][] data2x10aExpectedPermuteWt2Probs = 2252 [["random_value", "line", "weight"], 2253 ["0.96833865494543658", "8", "0.91836862"], 2254 ["0.91856842054413923", "4", "0.47379424"], 2255 ["0.25730832087795091", "7", "0.70529242"], 2256 ["0.23725317907018120", "9", "0.99103720"], 2257 ["0.16016096701872204", "3", "0.38627527"], 2258 ["0.090819662667243381", "10", "0.31401740"], 2259 ["0.0071764539244361172", "6", "0.05636231"], 2260 ["0.000000048318642951630057", "1", "0.26788837"], 2261 ["0.00000000037525692966535517", "5", "0.02966641"], 2262 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2263 2264 /* 2x10b - Uniform distribution [0,1000]. 
*/ 2265 string[][] data2x10b = 2266 [["line", "weight"], 2267 ["1", "761"], 2268 ["2", "432"], 2269 ["3", "103"], 2270 ["4", "448"], 2271 ["5", "750"], 2272 ["6", "711"], 2273 ["7", "867"], 2274 ["8", "841"], 2275 ["9", "963"], 2276 ["10", "784"]]; 2277 2278 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2279 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2280 2281 string[][] data2x10bExpectedPermuteWt2Probs = 2282 [["random_value", "line", "weight"], 2283 ["0.99996486739067969", "8", "841"], 2284 ["0.99991017467137211", "4", "448"], 2285 ["0.99960871524873662", "6", "711"], 2286 ["0.99914188537143800", "5", "750"], 2287 ["0.99903963250274785", "10", "784"], 2288 ["0.99889631825931946", "7", "867"], 2289 ["0.99852058315191139", "9", "963"], 2290 ["0.99575669679158918", "2", "432"], 2291 ["0.99408758732050595", "1", "761"], 2292 ["0.99315467761212362", "3", "103"]]; 2293 2294 /* 2x10c - Logarithmic distribution in random order. */ 2295 string[][] data2x10c = 2296 [["line", "weight"], 2297 ["1", "31.85"], 2298 ["2", "17403.31"], 2299 ["3", "653.84"], 2300 ["4", "8.23"], 2301 ["5", "2671.04"], 2302 ["6", "26226.08"], 2303 ["7", "1.79"], 2304 ["8", "354.56"], 2305 ["9", "35213.81"], 2306 ["10", "679.29"]]; 2307 2308 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2309 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2310 2311 string[][] data2x10cExpectedPermuteWt2Probs = 2312 [["random_value", "line", "weight"], 2313 ["0.99998939008709697", "6", "26226.08"], 2314 ["0.99995951291695517", "9", "35213.81"], 2315 ["0.99991666907613541", "8", "354.56"], 2316 ["0.99989445052186410", "2", "17403.31"], 2317 ["0.99975897602861630", "5", "2671.04"], 2318 ["0.99891852769877643", "3", "653.84"], 2319 ["0.99889167752782515", "10", "679.29"], 2320 ["0.99512207506850148", "4", "8.23"], 2321 ["0.86789371584259023", "1", "31.85"], 2322 ["0.58574438162915610", "7", "1.79"]]; 2323 2324 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 2325 string[][] data2x10d = 2326 [["line", "weight"], 2327 ["1", "1.79"], 2328 ["2", "8.23"], 2329 ["3", "31.85"], 2330 ["4", "354.56"], 2331 ["5", "653.84"], 2332 ["6", "679.29"], 2333 ["7", "2671.04"], 2334 ["8", "17403.31"], 2335 ["9", "26226.08"], 2336 ["10", "35213.81"]]; 2337 2338 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 2339 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 2340 2341 string[][] data2x10dExpectedPermuteWt2Probs = 2342 [["random_value", "line", "weight"], 2343 ["0.99999830221846353", "8", "17403.31"], 2344 ["0.99997860834041397", "10", "35213.81"], 2345 ["0.99994563828986716", "9", "26226.08"], 2346 ["0.99988650363575737", "4", "354.56"], 2347 ["0.99964161939190088", "7", "2671.04"], 2348 ["0.99959045338948649", "6", "679.29"], 2349 ["0.99901574490639788", "5", "653.84"], 2350 ["0.97803163304747431", "3", "31.85"], 2351 ["0.79994791806910948", "2", "8.23"], 2352 ["0.080374261239949119", "1", "1.79"]]; 2353 2354 /* 2x10e. Logarithmic distribution in descending order. 
*/ 2355 string[][] data2x10e = 2356 [["line", "weight"], 2357 ["1", "35213.81"], 2358 ["2", "26226.08"], 2359 ["3", "17403.31"], 2360 ["4", "2671.04"], 2361 ["5", "679.29"], 2362 ["6", "653.84"], 2363 ["7", "354.56"], 2364 ["8", "31.85"], 2365 ["9", "8.23"], 2366 ["10", "1.79"]]; 2367 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 2368 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 2369 2370 string[][] data2x10eExpectedPermuteWt2Probs = 2371 [["random_value", "line", "weight"], 2372 ["0.99998493348975237", "4", "2671.04"], 2373 ["0.99995934807202624", "3", "17403.31"], 2374 ["0.99992995739727453", "2", "26226.08"], 2375 ["0.99987185679245649", "1", "35213.81"], 2376 ["0.99957451563173938", "6", "653.84"], 2377 ["0.99907273650209583", "8", "31.85"], 2378 ["0.99905260312968946", "5", "679.29"], 2379 ["0.99730333650516401", "7", "354.56"], 2380 ["0.84093902435227808", "9", "8.23"], 2381 ["0.65650015926290028", "10", "1.79"]]; 2382 2383 /* Data sets for distinct sampling. */ 2384 string[][] data5x25 = 2385 [["ID", "Shape", "Color", "Size", "Weight"], 2386 ["01", "circle", "red", "S", "10"], 2387 ["02", "circle", "black", "L", "20"], 2388 ["03", "square", "black", "L", "20"], 2389 ["04", "circle", "green", "L", "30"], 2390 ["05", "ellipse", "red", "S", "20"], 2391 ["06", "triangle", "red", "S", "10"], 2392 ["07", "triangle", "red", "L", "20"], 2393 ["08", "square", "black", "S", "10"], 2394 ["09", "circle", "black", "S", "20"], 2395 ["10", "square", "green", "L", "20"], 2396 ["11", "triangle", "red", "L", "20"], 2397 ["12", "circle", "green", "L", "30"], 2398 ["13", "ellipse", "red", "S", "20"], 2399 ["14", "circle", "green", "L", "30"], 2400 ["15", "ellipse", "red", "L", "30"], 2401 ["16", "square", "red", "S", "10"], 2402 ["17", "circle", "black", "L", "20"], 2403 ["18", "square", "red", "S", "20"], 2404 ["19", "square", "black", "L", "20"], 2405 ["20", "circle", "red", "S", "10"], 2406 ["21", "ellipse", "black", "L", "30"], 2407 ["22", "triangle", 
"red", "L", "30"], 2408 ["23", "circle", "green", "S", "20"], 2409 ["24", "square", "green", "L", "20"], 2410 ["25", "circle", "red", "S", "10"], 2411 ]; 2412 2413 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 2414 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 2415 writeUnittestTsvFile(fpath_data5x25, data5x25); 2416 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1..$]); 2417 2418 string[][] data5x25ExpectedDistinctK2P40 = 2419 [["ID", "Shape", "Color", "Size", "Weight"], 2420 ["03", "square", "black", "L", "20"], 2421 ["05", "ellipse", "red", "S", "20"], 2422 ["08", "square", "black", "S", "10"], 2423 ["10", "square", "green", "L", "20"], 2424 ["13", "ellipse", "red", "S", "20"], 2425 ["15", "ellipse", "red", "L", "30"], 2426 ["16", "square", "red", "S", "10"], 2427 ["18", "square", "red", "S", "20"], 2428 ["19", "square", "black", "L", "20"], 2429 ["21", "ellipse", "black", "L", "30"], 2430 ["24", "square", "green", "L", "20"], 2431 ]; 2432 2433 string[][] data5x25ExpectedDistinctK2K4P20 = 2434 [["ID", "Shape", "Color", "Size", "Weight"], 2435 ["03", "square", "black", "L", "20"], 2436 ["07", "triangle", "red", "L", "20"], 2437 ["08", "square", "black", "S", "10"], 2438 ["10", "square", "green", "L", "20"], 2439 ["11", "triangle", "red", "L", "20"], 2440 ["16", "square", "red", "S", "10"], 2441 ["18", "square", "red", "S", "20"], 2442 ["19", "square", "black", "L", "20"], 2443 ["22", "triangle", "red", "L", "30"], 2444 ["24", "square", "green", "L", "20"], 2445 ]; 2446 2447 string[][] data5x25ExpectedDistinctK2K3K4P20 = 2448 [["ID", "Shape", "Color", "Size", "Weight"], 2449 ["04", "circle", "green", "L", "30"], 2450 ["07", "triangle", "red", "L", "20"], 2451 ["09", "circle", "black", "S", "20"], 2452 ["11", "triangle", "red", "L", "20"], 2453 ["12", "circle", "green", "L", "30"], 2454 ["14", "circle", "green", "L", "30"], 2455 ["16", "square", "red", "S", "10"], 2456 ["18", "square", "red", "S", "20"], 2457 
["22", "triangle", "red", "L", "30"], 2458 ]; 2459 2460 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 2461 string[][] data2x25 = 2462 [["Shape", "Size"], 2463 ["circle", "S"], 2464 ["circle", "L"], 2465 ["square", "L"], 2466 ["circle", "L"], 2467 ["ellipse", "S"], 2468 ["triangle", "S"], 2469 ["triangle", "L"], 2470 ["square", "S"], 2471 ["circle", "S"], 2472 ["square", "L"], 2473 ["triangle", "L"], 2474 ["circle", "L"], 2475 ["ellipse", "S"], 2476 ["circle", "L"], 2477 ["ellipse", "L"], 2478 ["square", "S"], 2479 ["circle", "L"], 2480 ["square", "S"], 2481 ["square", "L"], 2482 ["circle", "S"], 2483 ["ellipse", "L"], 2484 ["triangle", "L"], 2485 ["circle", "S"], 2486 ["square", "L"], 2487 ["circle", "S"], 2488 ]; 2489 2490 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 2491 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 2492 writeUnittestTsvFile(fpath_data2x25, data2x25); 2493 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1..$]); 2494 2495 string[][] data2x25ExpectedDistinctK1K2P20 = 2496 [["Shape", "Size"], 2497 ["square", "L"], 2498 ["triangle", "L"], 2499 ["square", "S"], 2500 ["square", "L"], 2501 ["triangle", "L"], 2502 ["square", "S"], 2503 ["square", "S"], 2504 ["square", "L"], 2505 ["triangle", "L"], 2506 ["square", "L"], 2507 ]; 2508 2509 string[][] data1x25 = 2510 [["Shape-Size"], 2511 ["circle-S"], 2512 ["circle-L"], 2513 ["square-L"], 2514 ["circle-L"], 2515 ["ellipse-S"], 2516 ["triangle-S"], 2517 ["triangle-L"], 2518 ["square-S"], 2519 ["circle-S"], 2520 ["square-L"], 2521 ["triangle-L"], 2522 ["circle-L"], 2523 ["ellipse-S"], 2524 ["circle-L"], 2525 ["ellipse-L"], 2526 ["square-S"], 2527 ["circle-L"], 2528 ["square-S"], 2529 ["square-L"], 2530 ["circle-S"], 2531 ["ellipse-L"], 2532 ["triangle-L"], 2533 ["circle-S"], 2534 ["square-L"], 2535 ["circle-S"], 2536 ]; 2537 2538 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 2539 string 
fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 2540 writeUnittestTsvFile(fpath_data1x25, data1x25); 2541 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1..$]); 2542 2543 string[][] data1x25ExpectedDistinctK1P20 = 2544 [["Shape-Size"], 2545 ["triangle-L"], 2546 ["square-S"], 2547 ["triangle-L"], 2548 ["ellipse-L"], 2549 ["square-S"], 2550 ["square-S"], 2551 ["ellipse-L"], 2552 ["triangle-L"], 2553 ]; 2554 2555 string[][] data1x25ExpectedDistinctK1P20Probs = 2556 [["random_value", "Shape-Size"], 2557 ["0", "triangle-L"], 2558 ["0", "square-S"], 2559 ["0", "triangle-L"], 2560 ["0", "ellipse-L"], 2561 ["0", "square-S"], 2562 ["0", "square-S"], 2563 ["0", "ellipse-L"], 2564 ["0", "triangle-L"], 2565 ]; 2566 2567 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 2568 [["random_value", "Shape-Size"], 2569 ["1", "circle-S"], 2570 ["4", "circle-L"], 2571 ["2", "square-L"], 2572 ["4", "circle-L"], 2573 ["2", "ellipse-S"], 2574 ["1", "triangle-S"], 2575 ["0", "triangle-L"], 2576 ["0", "square-S"], 2577 ["1", "circle-S"], 2578 ["2", "square-L"], 2579 ["0", "triangle-L"], 2580 ["4", "circle-L"], 2581 ["2", "ellipse-S"], 2582 ["4", "circle-L"], 2583 ["0", "ellipse-L"], 2584 ["0", "square-S"], 2585 ["4", "circle-L"], 2586 ["0", "square-S"], 2587 ["2", "square-L"], 2588 ["1", "circle-S"], 2589 ["0", "ellipse-L"], 2590 ["0", "triangle-L"], 2591 ["1", "circle-S"], 2592 ["2", "square-L"], 2593 ["1", "circle-S"], 2594 ]; 2595 2596 /* 2597 * Enough setup! Actually run some tests! 2598 */ 2599 2600 /* Permutations. Headers, static seed, compatibility mode. With weights and without. 
*/ 2601 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 2602 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 2603 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 2604 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 2605 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 2606 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 2607 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2608 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2609 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2610 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2611 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2612 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 2613 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 2614 2615 /* Permutations, without compatibility mode, or with both compatibility and printing. 
*/ 2616 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 2617 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 2618 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 2619 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 2620 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 2621 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 2622 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 2623 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 2624 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 2625 2626 /* Reservoir sampling using Algorithm R. 2627 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
2628 */ 2629 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2630 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2631 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 2632 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 2633 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 2634 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 2635 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2636 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum6); 2637 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum5); 2638 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum4); 2639 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum3); 2640 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum2); 2641 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedPermuteAlgoRNum1); 2642 2643 /* Bernoulli sampling cases. 
*/ 2644 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 2645 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 2646 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 2647 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 2648 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 2649 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2650 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 2651 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 2652 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 2653 2654 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 2655 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 2656 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 2657 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 2658 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 2659 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 2660 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 2661 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 2662 2663 /* Distinct sampling cases. */ 2664 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 2665 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 2666 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 2667 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 2668 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 2669 2670 2671 2672 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 2673 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
2674 */ 2675 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2676 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 2677 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2678 data3x6ExpectedWt3ProbsInorder); 2679 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 2680 data3x6ExpectedWt3V41ProbsInorder); 2681 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 2682 data3x6ExpectedDistinctK1K3P60Probs); 2683 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 2684 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 2685 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 2686 data3x6ExpectedDistinctK2P2ProbsInorder); 2687 2688 /* Simple random sampling with replacement. */ 2689 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2690 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 2691 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 2692 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 2693 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 2694 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 2695 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 2696 2697 /* Permutations, compatibility mode, without headers. 
*/ 2698 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1..$]); 2699 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1..$]); 2700 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1..$]); 2701 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..$]); 2702 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..$]); 2703 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2704 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2705 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2706 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1..$]); 2707 2708 /* Permutations, no headers, without compatibility mode, or with printing and compatibility mode. 
*/ 2709 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1..$]); 2710 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1..$]); 2711 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1..$]); 2712 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1..$]); 2713 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..$]); 2714 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..$]); 2715 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1..$]); 2716 2717 /* Reservoir sampling using Algorithm R, no headers. */ 2718 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 2719 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 2720 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1..$]); 2721 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1..$]); 2722 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2723 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum6[1..$]); 2724 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum5[1..$]); 2725 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum4[1..$]); 2726 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-s", "--num", "3", 
fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum3[1..$]); 2727 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum2[1..$]); 2728 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedPermuteAlgoRNum1[1..$]); 2729 2730 /* Bernoulli sampling cases. */ 2731 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1..$]); 2732 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2733 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1..$]); 2734 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2735 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..$]); 2736 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1..$]); 2737 2738 /* Bernoulli sampling with probabilities in skip sampling range. 
*/ 2739 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1..$]); 2740 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1..$]); 2741 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..$]); 2742 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1..$]); 2743 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1..$]); 2744 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1..$]); 2745 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1..$]); 2746 2747 /* Distinct sampling cases. */ 2748 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1..$]); 2749 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2750 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2751 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1..$]); 2752 2753 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. 
*/ 2754 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..$]); 2755 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1..$]); 2756 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 2757 data3x6ExpectedDistinctK1K3P60Probs[1..$]); 2758 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 2759 data3x6ExpectedDistinctK2P2ProbsInorder[1..$]); 2760 2761 /* Simple random sampling with replacement. */ 2762 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 2763 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 2764 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1..$]); 2765 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1..$]); 2766 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1..$]); 2767 2768 /* Multi-file tests. 
*/ 2769 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 2770 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2771 combo1ExpectedPermuteCompat); 2772 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 2773 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2774 combo1ExpectedPermuteCompatProbs); 2775 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 2776 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2777 combo1ExpectedPermuteWt3Probs); 2778 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2779 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2780 combo1ExpectedPermuteWt3); 2781 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2782 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2783 combo1ExpectedPermuteAlgoRNum4); 2784 2785 /* Multi-file, no headers. 
*/ 2786 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 2787 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2788 fpath_data3x6_noheader, fpath_data3x2_noheader], 2789 combo1ExpectedPermuteCompat[1..$]); 2790 testTsvSample(["test-c7", "--static-seed", "--print-random", 2791 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2792 fpath_data3x6_noheader, fpath_data3x2_noheader], 2793 combo1ExpectedPermuteCompatProbs[1..$]); 2794 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 2795 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2796 fpath_data3x6_noheader, fpath_data3x2_noheader], 2797 combo1ExpectedPermuteWt3Probs[1..$]); 2798 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 2799 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2800 fpath_data3x6_noheader, fpath_data3x2_noheader], 2801 combo1ExpectedPermuteWt3[1..$]); 2802 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 2803 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2804 fpath_data3x6_noheader, fpath_data3x2_noheader], 2805 combo1ExpectedPermuteAlgoRNum4[1..$]); 2806 2807 /* Bernoulli sampling cases. 
*/ 2808 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 2809 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2810 combo1ExpectedBernoulliCompatP50Probs); 2811 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 2812 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2813 combo1ExpectedBernoulliCompatP40); 2814 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 2815 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2816 fpath_data3x6_noheader, fpath_data3x2_noheader], 2817 combo1ExpectedBernoulliCompatP50Probs[1..$]); 2818 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 2819 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2820 fpath_data3x6_noheader, fpath_data3x2_noheader], 2821 combo1ExpectedBernoulliCompatP40[1..$]); 2822 2823 /* Bernoulli sampling with probabilities in skip sampling range. */ 2824 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 2825 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 2826 combo2ExpectedBernoulliSkipV333P03); 2827 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 2828 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 2829 combo2ExpectedBernoulliSkipV333P03[1..$]); 2830 2831 /* Distinct sampling cases. */ 2832 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 2833 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2834 combo1ExpectedDistinctK1P40); 2835 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 2836 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2837 fpath_data3x6_noheader, fpath_data3x2_noheader], 2838 combo1ExpectedDistinctK1P40[1..$]); 2839 2840 /* Generating random weights. 
*/ 2841 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 2842 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2843 combo1ExpectedProbsInorder); 2844 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 2845 fpath_data3x3_noheader, fpath_data3x1_noheader, 2846 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 2847 combo1ExpectedProbsInorder[1..$]); 2848 2849 /* Simple random sampling with replacement. */ 2850 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 2851 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 2852 combo1ExpectedReplaceNum10); 2853 2854 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 2855 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 2856 fpath_data3x6_noheader, fpath_data3x2_noheader], 2857 combo1ExpectedReplaceNum10[1..$]); 2858 2859 /* Single column file. */ 2860 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2861 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 2862 2863 /* Distributions. */ 2864 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 2865 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 2866 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 2867 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 2868 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 2869 2870 /* Tests of subset sample (--n|num) field. 
2871 * 2872 * Note: The way these tests are done ensures that subset length does not affect 2873 * output order. 2874 */ 2875 import std.algorithm : min; 2876 for (size_t n = data3x6.length + 2; n >= 1; n--) 2877 { 2878 /* reservoirSamplingViaHeap. 2879 */ 2880 size_t expectedLength = min(data3x6.length, n + 1); 2881 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 2882 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2883 2884 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 2885 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 2886 2887 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 2888 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 2889 2890 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 2891 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 2892 2893 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 2894 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 2895 2896 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 2897 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 2898 2899 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 2900 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 2901 2902 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 2903 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 2904 2905 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 2906 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 2907 2908 /* Bernoulli sampling. 
2909 */ 2910 import std.algorithm : min; 2911 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 2912 2913 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2914 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 2915 2916 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2917 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 2918 2919 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2920 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 2921 2922 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 2923 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 2924 2925 /* Distinct Sampling. 2926 */ 2927 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 2928 2929 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2930 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 2931 2932 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 2933 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 2934 2935 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2936 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 2937 2938 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 2939 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 2940 } 2941 2942 /* Similar tests with the 1x10 data set. 
*/ 2943 for (size_t n = data1x10.length + 2; n >= 1; n--) 2944 { 2945 size_t expectedLength = min(data1x10.length, n + 1); 2946 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 2947 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 2948 2949 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 2950 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 2951 2952 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 2953 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 2954 2955 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 2956 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 2957 } 2958 2959 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 2960 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 2961 { 2962 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 2963 data3x6ExpectedReplaceNum10[0 .. n + 1]); 2964 2965 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 2966 data3x6ExpectedReplaceNum10[1 .. n + 1]); 2967 } 2968 2969 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 2970 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 2971 { 2972 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 2973 2974 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2975 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 2976 2977 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 2978 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 2979 } 2980 2981 2982 /* Distinct sampling tests. 
*/ 2983 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 2984 data5x25ExpectedDistinctK2P40); 2985 2986 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 2987 data5x25ExpectedDistinctK2K4P20); 2988 2989 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 2990 data5x25ExpectedDistinctK2K3K4P20); 2991 2992 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 2993 data5x25ExpectedDistinctK2P40[1..$]); 2994 2995 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 2996 data5x25ExpectedDistinctK2K4P20[1..$]); 2997 2998 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 2999 data5x25ExpectedDistinctK2K3K4P20[1..$]); 3000 3001 3002 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 3003 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 3004 * in data2x25 are the same keys as '-k 2,4' in data5x25. 3005 */ 3006 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 3007 data2x25ExpectedDistinctK1K2P20); 3008 3009 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 3010 data2x25ExpectedDistinctK1K2P20); 3011 3012 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 3013 data2x25ExpectedDistinctK1K2P20[1..$]); 3014 3015 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 3016 data2x25ExpectedDistinctK1K2P20[1..$]); 3017 3018 /* Similar to the last set, but for a 1-column file. Also with random value printing. 
*/ 3019 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 3020 data1x25ExpectedDistinctK1P20); 3021 3022 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 3023 data1x25ExpectedDistinctK1P20); 3024 3025 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 3026 data1x25ExpectedDistinctK1P20[1..$]); 3027 3028 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 3029 data1x25ExpectedDistinctK1P20[1..$]); 3030 3031 3032 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 3033 data1x25ExpectedDistinctK1P20Probs); 3034 3035 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 3036 data1x25ExpectedDistinctK1P20Probs); 3037 3038 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 3039 data1x25ExpectedDistinctK1P20Probs[1..$]); 3040 3041 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 3042 data1x25ExpectedDistinctK1P20Probs[1..$]); 3043 3044 3045 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 3046 data1x25ExpectedDistinctK1P20ProbsInorder); 3047 3048 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 3049 data1x25ExpectedDistinctK1P20ProbsInorder); 3050 3051 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 3052 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 3053 3054 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 3055 data1x25ExpectedDistinctK1P20ProbsInorder[1..$]); 3056 3057 }