/**
Command line tool for shuffling or sampling lines from input streams. Several methods
are available, including weighted and unweighted shuffling, simple and weighted random
sampling, sampling with replacement, Bernoulli sampling, and distinct sampling.

Copyright (c) 2017-2020, eBay Inc.
Initially written by Jon Degenhardt

License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt)
*/
module tsv_utils.tsv_sample;

import std.array : appender, Appender, RefAppender;
import std.range;
import std.stdio;
import std.typecons : tuple, Flag;

/* Runtime option "gcopt=cleanup:none" skips the final GC collection pass at program
 * exit (presumably because the OS reclaims process memory faster — TODO confirm).
 * Guarded by __VERSION__ because rt_options support for this form needs DMD 2.085+.
 */
static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ];

version(unittest)
{
    // When running unit tests, use main from -main compiler switch.
}
else
{
    /** Main program.
     *
     * Invokes command line argument processing and calls tsvSample to do the real
     * work. Errors occurring during processing are caught and reported to the user.
     *
     * Returns 0 on success, 1 on error (also the exit codes produced by
     * TsvSampleOptions.processArgs).
     */
    int main(string[] cmdArgs)
    {
        /* When running in DMD code coverage mode, turn on report merging. */
        version(D_Coverage) version(DigitalMars)
        {
            import core.runtime : dmd_coverSetMerge;
            dmd_coverSetMerge(true);
        }

        TsvSampleOptions cmdopt;
        const r = cmdopt.processArgs(cmdArgs);
        /* r is (continueProcessing, exitCode); stop here on --help/--version/error. */
        if (!r[0]) return r[1];
        version(LDC_Profile)
        {
            /* Reset LDC profile counters so argument processing is excluded from profiles. */
            import ldc.profile : resetAll;
            resetAll();
        }
        try
        {
            import tsv_utils.common.utils : BufferedOutputRange;
            /* Wrap stdout in a buffered output range; all sampling routines write
             * through this rather than to stdout directly.
             */
            auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout);

            tsvSample(cmdopt, bufferedOutput);
        }
        catch (Exception exc)
        {
            stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg);
            return 1;
        }
        return 0;
    }
}

/* Help text shown for -h/--help. The longer form below is shown for --help-verbose. */
immutable helpText = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Use '--help-verbose' for detailed information.

Options:
EOS";

immutable helpTextVerbose = q"EOS
Synopsis: tsv-sample [options] [file...]

Sample input lines or randomize their order. Several modes of operation
are available:
* Shuffling (the default): All input lines are output in random order. All
  orderings are equally likely.
* Random sampling (--n|num N): A random sample of N lines are selected and
  written to standard output. By default, selected lines are written in
  random order. All sample sets and orderings are equally likely. Use
  --i|inorder to write the selected lines in the original input order.
* Weighted random sampling (--n|num N, --w|weight-field F): A weighted
  sample of N lines is produced. Weights are taken from field F. Lines are
  output in weighted selection order. Use --i|inorder to write in original
  input order. Omit --n|num to shuffle all lines (weighted shuffling).
* Sampling with replacement (--r|replace, --n|num N): All input lines are
  read in, then lines are repeatedly selected at random and written out.
  This continues until N lines are output. Individual lines can be written
  multiple times. Output continues forever if N is zero or not provided.
* Bernoulli sampling (--p|prob P): A random subset of lines is selected
  based on probability P, a 0.0-1.0 value. This is a streaming operation.
  A decision is made on each line as it is read. Line order is not changed.
* Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled
  based on the values in the key fields. A subset of keys are chosen based
  on the inclusion probability (a 'distinct' set of keys). All lines with
  one of the selected keys are output. Line order is not changed.

Sample size: The '--n|num' option controls the sample size for all
sampling methods. In the case of simple and weighted random sampling it
also limits the amount of memory required.

Controlling the random seed: By default, each run produces a different
randomization or sampling. Using '--s|static-seed' changes this so
multiple runs produce the same results. This works by using the same
random seed each run. The random seed can be specified using
'--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero
value is a no-op and ignored.)

Memory use: Bernoulli sampling and distinct sampling make decisions on
each line as it is read, there is no memory accumulation. These algorithms
can run on arbitrary size inputs. Sampling with replacement reads all
lines into memory and is limited by available memory. Shuffling also reads
all lines into memory and is similarly limited. Random sampling uses
reservoir sampling, and only needs to hold the sample size (--n|num) in
memory. The input data can be of any length.

Weighted sampling: Weighted random sampling is done using an algorithm
described by Pavlos Efraimidis and Paul Spirakis. Weights should be
positive values representing the relative weight of the entry in the
collection. Counts and similar can be used as weights, it is *not*
necessary to normalize to a [0,1] interval. Negative values are not
meaningful and given the value zero. Input order is not retained, instead
lines are output ordered by the randomized weight that was assigned. This
means that a smaller valid sample can be produced by taking the first N
lines of output. For more info on the sampling approach see:
* Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling
* "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis
  (https://arxiv.org/abs/1012.0256)

Printing random values: Most of the sampling algorithms work by generating
a random value for each line. (See "Compatibility mode" below.) The nature
of these values depends on the sampling algorithm. They are used for both
line selection and output ordering. The '--p|print-random' option can be
used to print these values. The random value is prepended to the line
separated by the --d|delimiter char (TAB by default). The
'--gen-random-inorder' option takes this one step further, generating
random values for all input lines without changing the input order. The
types of values currently used by these sampling algorithms:
* Unweighted sampling: Uniform random value in the interval [0,1]. This
  includes Bernoulli sampling and unweighted line order randomization.
* Weighted sampling: Value in the interval [0,1]. Distribution depends on
  the values in the weight field. It is used as a partial ordering.
* Distinct sampling: An integer, zero and up, representing a selection
  group. The inclusion probability determines the number of selection groups.
* Sampling with replacement: Random value printing is not supported.

The specifics behind these random values are subject to change in future
releases.

Compatibility mode: As described above, many of the sampling algorithms
assign a random value to each line. This is useful when printing random
values. It has another occasionally useful property: repeated runs with
the same static seed but different selection parameters are more
compatible with each other, as each line gets assigned the same random
value on every run. For example, if Bernoulli sampling is run with
'--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed',
all the lines selected in the first run will be selected in the second.
This comes at a cost: in some cases there are faster algorithms that don't
preserve this property. By default, tsv-sample will use faster algorithms
when available. However, the '--compatibility-mode' option switches to
algorithms that assign a random value per line. Printing random values
also engages compatibility mode.

Options:
EOS";
/** Container for command line options and derived data.
 *
 * TsvSampleOptions handles several aspects of command line options. On the input side,
 * it defines the command line options available, performs validation, and sets up any
 * derived state based on the options provided. These activities are handled by the
 * processArgs() member.
 *
 * Once argument processing is complete, TsvSampleOptions is used as a container
 * holding the specific processing options used by the different sampling routines.
 */
struct TsvSampleOptions
{
    string programName;                        /// Program name
    string[] files;                            /// Input files
    bool helpVerbose = false;                  /// --help-verbose
    bool hasHeader = false;                    /// --H|header
    ulong sampleSize = 0;                      /// --n|num - Size of the desired sample
    double inclusionProbability = double.nan;  /// --p|prob - Inclusion probability
    size_t[] keyFields;                        /// --k|key-fields - Used with inclusion probability
    size_t weightField = 0;                    /// --w|weight-field - Field holding the weight
    bool srsWithReplacement = false;           /// --r|replace
    bool preserveInputOrder = false;           /// --i|inorder
    bool staticSeed = false;                   /// --s|static-seed
    uint seedValueOptionArg = 0;               /// --v|seed-value
    bool printRandom = false;                  /// --print-random
    bool genRandomInorder = false;             /// --gen-random-inorder
    string randomValueHeader = "random_value"; /// --random-value-header
    bool compatibilityMode = false;            /// --compatibility-mode
    char delim = '\t';                         /// --d|delimiter
    bool versionWanted = false;                /// --V|version
    bool preferSkipSampling = false;           /// --prefer-skip-sampling
    bool preferAlgorithmR = false;             /// --prefer-algorithm-r
    bool hasWeightField = false;               /// Derived.
    bool useBernoulliSampling = false;         /// Derived.
    bool useDistinctSampling = false;          /// Derived.
    bool distinctKeyIsFullLine = false;        /// Derived. True if '--k|key-fields 0' is specified.
    bool usingUnpredictableSeed = true;        /// Derived from --static-seed, --seed-value
    uint seed = 0;                             /// Derived from --static-seed, --seed-value

    /** Process tsv-sample command line arguments.
     *
     * Defines the command line options, performs validation, and derives additional
     * state. std.getopt.getopt is called to do the main option processing followed
     * by additional validation and derivation.
     *
     * Help text is printed to standard output if help was requested. Error text is
     * written to stderr if invalid input is encountered.
     *
     * A tuple is returned. First value is true if command line arguments were
     * successfully processed and execution should continue, or false if an error
     * occurred or the user asked for help. If false, the second value is the
     * appropriate exit code (0 or 1).
     *
     * Returning true (execution continues) means args have been validated and derived
     * values calculated. Field indices will have been converted to zero-based.
     */
    auto processArgs(ref string[] cmdArgs)
    {
        import std.algorithm : any, canFind, each;
        import std.getopt;
        import std.math : isNaN;
        import std.path : baseName, stripExtension;
        import std.typecons : Yes, No;
        import tsv_utils.common.utils : makeFieldListOptionHandler;

        programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name";

        try
        {
            arraySep = ",";    // Use comma to separate values in command line options

            auto r = getopt(
                cmdArgs,
                "help-verbose", " Print more detailed help.", &helpVerbose,

                std.getopt.config.caseSensitive,
                "H|header", " Treat the first line of each file as a header.", &hasHeader,
                std.getopt.config.caseInsensitive,

                "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize,
                "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected for output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability,

                "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.",
                keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero),

                "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField,
                "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement,
                "i|inorder", " Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder,
                "s|static-seed", " Use the same random seed every run.", &staticSeed,

                std.getopt.config.caseSensitive,
                "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg,
                std.getopt.config.caseInsensitive,

                "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom,
                "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder,
                "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader,
                "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode,

                "d|delimiter", "CHR Field delimiter.", &delim,

                std.getopt.config.caseSensitive,
                "V|version", " Print version information and exit.", &versionWanted,
                std.getopt.config.caseInsensitive,

                "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.",
                &preferSkipSampling,

                "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.",
                &preferAlgorithmR,
                );

            if (r.helpWanted)
            {
                defaultGetoptPrinter(helpText, r.options);
                return tuple(false, 0);
            }
            else if (helpVerbose)
            {
                defaultGetoptPrinter(helpTextVerbose, r.options);
                return tuple(false, 0);
            }
            else if (versionWanted)
            {
                import tsv_utils.common.tsvutils_version;
                writeln(tsvutilsVersionNotice("tsv-sample"));
                return tuple(false, 0);
            }

            /* Derivations and validations. */
            if (weightField > 0)
            {
                hasWeightField = true;
                weightField--;    // Switch to zero-based indexes.
            }

            /* Sampling with replacement is standalone; it cannot be combined with
             * weights, probabilities, distinct keys, random value printing, or
             * input order preservation.
             */
            if (srsWithReplacement)
            {
                if (hasWeightField)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support weights (--w|weight-field).");
                }
                else if (!inclusionProbability.isNaN)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob).");
                }
                else if (keyFields.length > 0)
                {
                    throw new Exception("Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields).");
                }
                else if (printRandom || genRandomInorder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder).");
                }
                else if (preserveInputOrder)
                {
                    throw new Exception("Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option).");
                }
            }

            if (keyFields.length > 0)
            {
                /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */

                if (inclusionProbability.isNaN) throw new Exception("--p|prob is required when using --k|key-fields.");

                if (keyFields.length == 1 && keyFields[0] == 0)
                {
                    distinctKeyIsFullLine = true;
                }
                else
                {
                    if (keyFields.length > 1 && keyFields.any!(x => x == 0))
                    {
                        throw new Exception("Whole line as key (--k|key-fields 0) cannot be combined with multiple fields.");
                    }

                    keyFields.each!((ref x) => --x);    // Convert to zero-based indexing.
                }
            }

            /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */
            if (!inclusionProbability.isNaN)
            {
                if (inclusionProbability <= 0.0 || inclusionProbability > 1.0)
                {
                    import std.format : format;
                    throw new Exception(
                        format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability));
                }

                if (keyFields.length > 0) useDistinctSampling = true;
                else useBernoulliSampling = true;

                if (hasWeightField) throw new Exception("--w|weight-field and --p|prob cannot be used together.");

                if (genRandomInorder && !useDistinctSampling)
                {
                    throw new Exception("--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used.");
                }
            }
            else if (genRandomInorder && !hasWeightField)
            {
                useBernoulliSampling = true;
            }

            if (randomValueHeader.length == 0 || randomValueHeader.canFind('\n') ||
                randomValueHeader.canFind(delim))
            {
                throw new Exception("--random-value-header must be at least one character and not contain field delimiters or newlines.");
            }

            /* Check for incompatible use of (--i|inorder) and shuffling of the full
             * data set. Sampling with replacement is also incompatible, this is
             * detected earlier. Shuffling is the default operation, so it is identified
             * by eliminating the other modes of operation.
             */
            if (preserveInputOrder &&
                sampleSize == 0 &&
                !useBernoulliSampling &&
                !useDistinctSampling
               )
            {
                throw new Exception("Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder.");
            }

            /* Compatibility mode checks:
             * - Random value printing implies compatibility-mode, otherwise user's
             *   selection is used.
             * - Distinct sampling doesn't support compatibility-mode. The routines
             *   don't care, but users might expect larger probabilities to be a
             *   superset of smaller probabilities. This would be confusing, so
             *   flag it as an error.
             */
            if (compatibilityMode && useDistinctSampling)
            {
                throw new Exception("Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode.");
            }

            if (printRandom || genRandomInorder) compatibilityMode = true;

            /* Seed. Exactly one of the three cases applies: unpredictable seed,
             * user-provided seed value, or the fixed static seed.
             */
            import std.random : unpredictableSeed;

            usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0);

            if (usingUnpredictableSeed) seed = unpredictableSeed;
            else if (seedValueOptionArg != 0) seed = seedValueOptionArg;
            else if (staticSeed) seed = 2438424139;
            else assert(0, "Internal error, invalid seed option states.");

            /* Assume remaining args are files. Use standard input if files were not provided. */
            files ~= (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"];
            cmdArgs.length = 1;
        }
        catch (Exception exc)
        {
            stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg);
            return tuple(false, 1);
        }
        return tuple(true, 0);
    }
}
458 */ 459 void tsvSample(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 460 if (isOutputRange!(OutputRange, char)) 461 { 462 if (cmdopt.srsWithReplacement) 463 { 464 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 465 } 466 else if (cmdopt.useBernoulliSampling) 467 { 468 bernoulliSamplingCommand(cmdopt, outputStream); 469 } 470 else if (cmdopt.useDistinctSampling) 471 { 472 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 473 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 474 } 475 else if (cmdopt.genRandomInorder) 476 { 477 /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli, 478 * Distinct), or don't handle it (SRS w/ Replacement). 479 */ 480 assert(cmdopt.hasWeightField); 481 generateWeightedRandomValuesInorder(cmdopt, outputStream); 482 } 483 else if (cmdopt.sampleSize != 0) 484 { 485 randomSamplingCommand(cmdopt, outputStream); 486 } 487 else 488 { 489 shuffleCommand(cmdopt, outputStream); 490 } 491 } 492 493 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling 494 * routine based on the command line arguments. 495 * 496 * This routine selects the appropriate Bernoulli sampling function and template 497 * instantiation to use based on the command line arguments. 498 * 499 * One of the basic choices is whether to use the vanilla algorithm or skip sampling. 500 * Skip sampling is a little bit faster when the inclusion probability is small but 501 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation 502 * for a discussion of the skipSamplingProbabilityThreshold used here. 
503 */ 504 void bernoulliSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 505 if (isOutputRange!(OutputRange, char)) 506 { 507 assert(!cmdopt.hasWeightField); 508 509 immutable double skipSamplingProbabilityThreshold = 0.04; 510 511 if (cmdopt.compatibilityMode || 512 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 513 { 514 if (cmdopt.genRandomInorder) 515 { 516 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 517 } 518 else 519 { 520 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 521 } 522 } 523 else 524 { 525 bernoulliSkipSampling(cmdopt, outputStream); 526 } 527 } 528 529 /** Bernoulli sampling of lines from the input stream. 530 * 531 * Each input line is a assigned a random value and output if less than 532 * cmdopt.inclusionProbability. The order of the lines is not changed. 533 * 534 * This routine supports random value printing and gen-random-inorder value printing. 535 */ 536 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 537 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 538 if (isOutputRange!(OutputRange, char)) 539 { 540 import std.random : Random = Mt19937, uniform01; 541 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 542 543 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 544 else assert(!cmdopt.genRandomInorder); 545 546 auto randomGenerator = Random(cmdopt.seed); 547 548 /* Process each line. */ 549 bool headerWritten = false; 550 ulong numLinesWritten = 0; 551 foreach (filename; cmdopt.files) 552 { 553 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 554 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 555 { 556 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 557 if (fileLineNum == 1 && cmdopt.hasHeader) 558 { 559 if (!headerWritten) 560 { 561 static if (generateRandomAll) 562 { 563 outputStream.put(cmdopt.randomValueHeader); 564 outputStream.put(cmdopt.delim); 565 } 566 else if (cmdopt.printRandom) 567 { 568 outputStream.put(cmdopt.randomValueHeader); 569 outputStream.put(cmdopt.delim); 570 } 571 572 outputStream.put(line); 573 outputStream.put("\n"); 574 headerWritten = true; 575 } 576 } 577 else 578 { 579 immutable double lineScore = uniform01(randomGenerator); 580 581 static if (generateRandomAll) 582 { 583 outputStream.formatRandomValue(lineScore); 584 outputStream.put(cmdopt.delim); 585 outputStream.put(line); 586 outputStream.put("\n"); 587 588 if (cmdopt.sampleSize != 0) 589 { 590 ++numLinesWritten; 591 if (numLinesWritten == cmdopt.sampleSize) return; 592 } 593 } 594 else if (lineScore < cmdopt.inclusionProbability) 595 { 596 if (cmdopt.printRandom) 597 { 598 outputStream.formatRandomValue(lineScore); 599 outputStream.put(cmdopt.delim); 600 } 601 outputStream.put(line); 602 outputStream.put("\n"); 603 604 if (cmdopt.sampleSize != 0) 605 { 606 ++numLinesWritten; 607 if (numLinesWritten == cmdopt.sampleSize) return; 608 } 609 } 610 } 611 } 612 } 613 } 614 615 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips. 616 * 617 * Skip sampling works by skipping a random number of lines between selections. This 618 * can be faster than assigning a random value to each line when the inclusion 619 * probability is low, as it reduces the number of calls to the random number 620 * generator. Both the random number generator and the log() function are called when 621 * calculating the next skip size. These additional log() calls add up as the 622 * inclusion probability increases. 
623 * 624 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 625 * file-oriented line sampling. This is obviously environment specific. In the 626 * environments this implementation has been tested in the performance improvements 627 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 628 * 629 * The algorithm does not assign random values to individual lines. This makes it 630 * incompatible with random value printing. It is not suitable for compatibility mode 631 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 632 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 633 * does not have this property. 634 * 635 * The algorithm for calculating the skip size has been described by multiple sources. 636 * There are two key variants depending on whether the total number of lines in the 637 * data set is known in advance. (This implementation does not know the total.) 638 * Useful references: 639 * $(LIST 640 * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 641 * ACM Trans on Mathematical Software, 1987. On-line: 642 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 643 * * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book 644 * "Data Stream Management", Springer-Verlag, 2016. On-line: 645 * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf 646 * * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. 
On-line: 647 * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ 648 * ) 649 */ 650 void bernoulliSkipSampling(OutputRange)(TsvSampleOptions cmdopt, OutputRange outputStream) 651 if (isOutputRange!(OutputRange, char)) 652 { 653 import std.conv : to; 654 import std.math : log, trunc; 655 import std.random : Random = Mt19937, uniform01; 656 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 657 658 assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 659 assert(!cmdopt.printRandom); 660 assert(!cmdopt.compatibilityMode); 661 662 auto randomGenerator = Random(cmdopt.seed); 663 664 immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 665 immutable double logDiscardRate = log(discardRate); 666 667 /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed 668 * interval to (0.0, 1.0], excluding 0.0. 669 */ 670 size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 671 672 /* Process each line. */ 673 bool headerWritten = false; 674 ulong numLinesWritten = 0; 675 foreach (filename; cmdopt.files) 676 { 677 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 678 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 679 { 680 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 681 if (fileLineNum == 1 && cmdopt.hasHeader) 682 { 683 if (!headerWritten) 684 { 685 outputStream.put(line); 686 outputStream.put("\n"); 687 headerWritten = true; 688 } 689 } 690 else if (remainingSkips > 0) 691 { 692 --remainingSkips; 693 } 694 else 695 { 696 outputStream.put(line); 697 outputStream.put("\n"); 698 699 if (cmdopt.sampleSize != 0) 700 { 701 ++numLinesWritten; 702 if (numLinesWritten == cmdopt.sampleSize) return; 703 } 704 705 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 706 } 707 } 708 } 709 } 710 711 /** Sample lines by choosing a random set of distinct keys formed from one or more 712 * fields on each line. 713 * 714 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling. 715 * However, instead of each line being subject to an independent trial, lines are 716 * selected based on a key from each line. A portion of keys are randomly selected for 717 * output, and every line containing a selected key is included in the output. 718 * 719 * An example use-case is a query log having <user, query, clicked-url> triples. It is 720 * often useful to sample records for portion of the users, but including all records 721 * for the users selected. Distinct sampling supports this by selecting a subset of 722 * users to include in the output. 723 * 724 * Distinct sampling is done by hashing the key and mapping the hash value into 725 * buckets sized to hold the inclusion probability. Records having a key mapping to 726 * bucket zero are output. Buckets are equal size and therefore may be larger than the 727 * inclusion probability. (The other approach would be to have the caller specify the 728 * the number of buckets. More correct, but less convenient.) 
729 */ 730 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 731 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 732 if (isOutputRange!(OutputRange, char)) 733 { 734 import std.algorithm : splitter; 735 import std.conv : to; 736 import std.digest.murmurhash; 737 import std.math : lrint; 738 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, throwIfWindowsNewlineOnUnix; 739 740 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 741 else assert(!cmdopt.genRandomInorder); 742 743 assert(cmdopt.keyFields.length > 0); 744 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 745 746 static if (generateRandomAll) 747 { 748 import std.format : formatValue, singleSpec; 749 immutable randomValueFormatSpec = singleSpec("%d"); 750 } 751 752 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 753 754 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 755 756 /* Create a mapping for the key fields. */ 757 auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 758 759 /* Process each line. */ 760 bool headerWritten = false; 761 ulong numLinesWritten = 0; 762 foreach (filename; cmdopt.files) 763 { 764 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 765 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 766 { 767 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 768 if (fileLineNum == 1 && cmdopt.hasHeader) 769 { 770 if (!headerWritten) 771 { 772 static if (generateRandomAll) 773 { 774 outputStream.put(cmdopt.randomValueHeader); 775 outputStream.put(cmdopt.delim); 776 } 777 else if (cmdopt.printRandom) 778 { 779 outputStream.put(cmdopt.randomValueHeader); 780 outputStream.put(cmdopt.delim); 781 } 782 783 outputStream.put(line); 784 outputStream.put("\n"); 785 headerWritten = true; 786 } 787 } 788 else 789 { 790 /* Murmurhash works by successively adding individual keys, then finalizing. 791 * Adding individual keys is simpler if the full-line-as-key and individual 792 * fields as keys cases are separated. 793 */ 794 auto hasher = MurmurHash3!32(cmdopt.seed); 795 796 if (cmdopt.distinctKeyIsFullLine) 797 { 798 hasher.put(cast(ubyte[]) line); 799 } 800 else 801 { 802 assert(keyFieldsReordering !is null); 803 804 /* Gather the key field values and assemble the key. */ 805 keyFieldsReordering.initNewLine; 806 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 807 { 808 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 809 if (keyFieldsReordering.allFieldsFilled) break; 810 } 811 812 if (!keyFieldsReordering.allFieldsFilled) 813 { 814 import std.format : format; 815 throw new Exception( 816 format("Not enough fields in line. File: %s, Line: %s", 817 (filename == "-") ? 
"Standard Input" : filename, fileLineNum)); 818 } 819 820 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 821 { 822 if (count > 0) hasher.put(delimArray); 823 hasher.put(cast(ubyte[]) key); 824 } 825 } 826 827 hasher.finish; 828 829 static if (generateRandomAll) 830 { 831 import std.conv : to; 832 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 833 outputStream.put(cmdopt.delim); 834 outputStream.put(line); 835 outputStream.put("\n"); 836 837 if (cmdopt.sampleSize != 0) 838 { 839 ++numLinesWritten; 840 if (numLinesWritten == cmdopt.sampleSize) return; 841 } 842 } 843 else if (hasher.get % numBuckets == 0) 844 { 845 if (cmdopt.printRandom) 846 { 847 outputStream.put('0'); 848 outputStream.put(cmdopt.delim); 849 } 850 outputStream.put(line); 851 outputStream.put("\n"); 852 853 if (cmdopt.sampleSize != 0) 854 { 855 ++numLinesWritten; 856 if (numLinesWritten == cmdopt.sampleSize) return; 857 } 858 } 859 } 860 } 861 } 862 } 863 864 /** Random sampling command handler. Invokes the appropriate sampling routine based on 865 * the command line arguments. 866 * 867 * Random sampling selects a fixed size random sample from the input stream. Both 868 * simple random sampling (equal likelihood) and weighted random sampling are 869 * supported. Selected lines are output either in random order or original input order. 870 * For weighted sampling the random order is the weighted selection order. 871 * 872 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via 873 * Algorithm R. This routine selects the appropriate reservoir sampling function and 874 * template instantiation to based on the command line arguments. 875 * 876 * Weighted sampling always uses the heap approach. Compatibility mode does as well, 877 * as it is the method that uses per-line random value assignments. 
The implication 878 * of compatibility mode is that a larger sample size includes all the results from 879 * a smaller sample, assuming the same random seed is used. 880 * 881 * For unweighted sampling there is a performance tradeoff between implementations. 882 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for 883 * large sample sizes. The threshold used was chosen based on performance tests. See 884 * the reservoirSamplingAlgorithmR documentation for more information. 885 */ 886 887 void randomSamplingCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 888 if (isOutputRange!(OutputRange, char)) 889 { 890 assert(cmdopt.sampleSize != 0); 891 892 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 893 894 if (cmdopt.hasWeightField) 895 { 896 if (cmdopt.preserveInputOrder) 897 { 898 reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 899 } 900 else 901 { 902 reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 903 } 904 } 905 else if (cmdopt.compatibilityMode || 906 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 907 { 908 if (cmdopt.preserveInputOrder) 909 { 910 reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 911 } 912 else 913 { 914 reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 915 } 916 } 917 else if (cmdopt.preserveInputOrder) 918 { 919 reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream); 920 } 921 else 922 { 923 reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream); 924 } 925 } 926 927 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are 928 * supported. 929 * 930 * The algorithm used here is based on the one-pass algorithm described by Pavlos 931 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 
 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are
 * simply set to one.
 *
 * The implementation uses a heap (priority queue) large enough to hold the desired
 * number of lines. Input is read line-by-line, assigned a random value, and added to
 * the heap. The role of the heap is to identify the lines with the highest assigned
 * random values. Once the heap is full, adding a new line means dropping the line with
 * the lowest score. A "min" heap is used for this reason.
 *
 * When done reading all lines, the "min" heap is in reverse of weighted selection
 * order. Weighted selection order is obtained by removing each element one at a time
 * from the heap. The underlying data store will have the elements in weighted selection
 * order (largest weights first).
 *
 * Generating output in weighted order is useful for several reasons:
 *  - For weighted sampling, it preserves the property that smaller valid subsets can be
 *    created by taking the first N lines.
 *  - For unweighted sampling, it ensures that all output permutations are possible, and
 *    are not influenced by input order or the heap data structure used.
 *  - Order consistency is maintained when making repeated use of the same random seed,
 *    but with different sample sizes.
 *
 * The other choice is preserving input order. This is supported by recording line
 * numbers and sorting the selected sample.
 *
 * There are use cases where only the selection set matters. For these some performance
 * could be gained by skipping the reordering and simply printing the backing store
 * array in-order. Performance tests indicate only a minor benefit, so this is not
 * supported.
 *
 * Notes:
 * $(LIST
 *   * In tsv-sample versions 1.2.1 and earlier this routine also supported
 *     randomization of all input lines.
This was dropped in version 1.2.2 in favor 966 * of the approach used in randomizeLines. The latter has significant advantages 967 * given that all data must be read into memory. 968 * * For large reservoir sizes better performance can be achieved using Algorithm R. 969 * See the reservoirSamplingAlgorithmR documentation for details. 970 * ) 971 */ 972 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 973 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 974 if (isOutputRange!(OutputRange, char)) 975 { 976 import std.algorithm : sort; 977 import std.container.array; 978 import std.container.binaryheap; 979 import std.meta : AliasSeq; 980 import std.random : Random = Mt19937, uniform01; 981 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 982 983 static if (isWeighted) assert(cmdopt.hasWeightField); 984 else assert(!cmdopt.hasWeightField); 985 986 assert(cmdopt.sampleSize > 0); 987 988 auto randomGenerator = Random(cmdopt.seed); 989 990 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 991 { 992 double score; 993 const(char)[] line; 994 static if (preserveInputOrder) ulong lineNumber; 995 } 996 997 /* Create the heap and backing data store. 998 * 999 * Note: An std.container.array is used as the backing store to avoid some issues in 1000 * the standard library (Phobos) binaryheap implementation. Specifically, when an 1001 * std.container.array is used as backing store, the heap can efficiently reversed by 1002 * removing the heap elements. This leaves the backing store in the reversed order. 1003 * However, the current binaryheap implementation does not support this for all 1004 * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. 
1005 */ 1006 1007 Array!(Entry!preserveInputOrder) dataStore; 1008 dataStore.reserve(cmdopt.sampleSize); 1009 auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap 1010 1011 /* Process each line. */ 1012 bool headerWritten = false; 1013 static if (preserveInputOrder) ulong totalLineNum = 0; 1014 foreach (filename; cmdopt.files) 1015 { 1016 auto inputStream = (filename == "-") ? stdin : filename.File(); 1017 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 1018 { 1019 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 1020 if (fileLineNum == 1 && cmdopt.hasHeader) 1021 { 1022 if (!headerWritten) 1023 { 1024 if (cmdopt.printRandom) 1025 { 1026 outputStream.put(cmdopt.randomValueHeader); 1027 outputStream.put(cmdopt.delim); 1028 } 1029 outputStream.put(line); 1030 outputStream.put("\n"); 1031 headerWritten = true; 1032 } 1033 } 1034 else 1035 { 1036 static if (!isWeighted) 1037 { 1038 immutable double lineScore = uniform01(randomGenerator); 1039 } 1040 else 1041 { 1042 immutable double lineWeight = 1043 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum); 1044 immutable double lineScore = 1045 (lineWeight > 0.0) 1046 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1047 : 0.0; 1048 } 1049 1050 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1051 else alias entryCTArgs = AliasSeq!(); 1052 1053 if (reservoir.length < cmdopt.sampleSize) 1054 { 1055 reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1056 } 1057 else if (reservoir.front.score < lineScore) 1058 { 1059 reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1060 } 1061 1062 static if (preserveInputOrder) ++totalLineNum; 1063 } 1064 } 1065 } 1066 1067 /* Done with input, all entries are in the reservoir. */ 1068 1069 /* The asserts here avoid issues with the current binaryheap implementation. 
They 1070 * detect use of backing stores having a length not synchronized to the reservoir. 1071 */ 1072 immutable ulong numLines = reservoir.length; 1073 assert(numLines == dataStore.length); 1074 1075 /* Update the backing store so it is in the desired output order. 1076 */ 1077 static if (preserveInputOrder) 1078 { 1079 dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber); 1080 } 1081 else 1082 { 1083 /* Output in weighted selection order. The heap is in reverse order of assigned 1084 * weights. Reversing order is done by removing all elements from the heap. This 1085 * leaves the backing store in the correct order. 1086 */ 1087 while (!reservoir.empty) reservoir.removeFront; 1088 } 1089 1090 assert(numLines == dataStore.length); 1091 1092 foreach (entry; dataStore) 1093 { 1094 if (cmdopt.printRandom) 1095 { 1096 outputStream.formatRandomValue(entry.score); 1097 outputStream.put(cmdopt.delim); 1098 } 1099 outputStream.put(entry.line); 1100 outputStream.put("\n"); 1101 } 1102 } 1103 1104 /** Generate weighted random values for all input lines, preserving input order. 1105 * 1106 * This complements weighted reservoir sampling, but instead of using a reservoir it 1107 * simply iterates over the input lines generating the values. The weighted random 1108 * values are generated with the same formula used by reservoirSampling. 1109 */ 1110 void generateWeightedRandomValuesInorder(OutputRange) 1111 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1112 if (isOutputRange!(OutputRange, char)) 1113 { 1114 import std.random : Random = Mt19937, uniform01; 1115 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 1116 1117 assert(cmdopt.hasWeightField); 1118 1119 auto randomGenerator = Random(cmdopt.seed); 1120 1121 /* Process each line. */ 1122 bool headerWritten = false; 1123 ulong numLinesWritten = 0; 1124 foreach (filename; cmdopt.files) 1125 { 1126 auto inputStream = (filename == "-") ? 
stdin : filename.File(); 1127 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 1128 { 1129 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 1130 if (fileLineNum == 1 && cmdopt.hasHeader) 1131 { 1132 if (!headerWritten) 1133 { 1134 outputStream.put(cmdopt.randomValueHeader); 1135 outputStream.put(cmdopt.delim); 1136 outputStream.put(line); 1137 outputStream.put("\n"); 1138 headerWritten = true; 1139 } 1140 } 1141 else 1142 { 1143 immutable double lineWeight = 1144 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, filename, fileLineNum); 1145 1146 immutable double lineScore = 1147 (lineWeight > 0.0) 1148 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1149 : 0.0; 1150 1151 outputStream.formatRandomValue(lineScore); 1152 outputStream.put(cmdopt.delim); 1153 outputStream.put(line); 1154 outputStream.put("\n"); 1155 1156 if (cmdopt.sampleSize != 0) 1157 { 1158 ++numLinesWritten; 1159 if (numLinesWritten == cmdopt.sampleSize) return; 1160 } 1161 } 1162 } 1163 } 1164 } 1165 1166 /** Reservoir sampling via Algorithm R 1167 * 1168 * This is an implementation of reservoir sampling using what is commonly known as 1169 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of 1170 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about 1171 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with 1172 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" 1173 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). 1174 * 1175 * Algorithm R is used for unweighted sampling without replacement. The heap-based 1176 * algorithm in reservoirSamplingViaHeap is used for weighted sampling. 1177 * 1178 * The classic algorithm stops after identifying the selected set of items. This 1179 * implementation goes one step further and randomizes the order of the selected 1180 * lines. 
This is consistent with shuffling (line order randomization), a primary 1181 * tsv-sample use-case. 1182 * 1183 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1184 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1185 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1186 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1187 * 1188 * This speed advantage may be offset a certain amount by using a more expensive random 1189 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1190 * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing 1191 * interval. The latter is expected to be more expensive. This is consistent with 1192 * performance tests indicating that reservoirSamplingViaHeap is faster when using 1193 * small-to-medium size reservoirs and large input streams. 1194 */ 1195 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 1196 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1197 if (isOutputRange!(OutputRange, char)) 1198 { 1199 import std.meta : AliasSeq; 1200 import std.random : Random = Mt19937, randomShuffle, uniform; 1201 import std.algorithm : sort; 1202 import tsv_utils.common.utils : bufferedByLine, throwIfWindowsNewlineOnUnix; 1203 1204 assert(cmdopt.sampleSize > 0); 1205 assert(!cmdopt.hasWeightField); 1206 assert(!cmdopt.compatibilityMode); 1207 assert(!cmdopt.printRandom); 1208 assert(!cmdopt.genRandomInorder); 1209 1210 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 1211 { 1212 const(char)[] line; 1213 static if (preserveInputOrder) ulong lineNumber; 1214 } 1215 1216 Entry!preserveInputOrder[] reservoir; 1217 auto reservoirAppender = appender(&reservoir); 1218 reservoirAppender.reserve(cmdopt.sampleSize); 1219 1220 auto randomGenerator = Random(cmdopt.seed); 1221 1222 /* Process 
each line. */ 1223 1224 bool headerWritten = false; 1225 ulong totalLineNum = 0; 1226 foreach (filename; cmdopt.files) 1227 { 1228 auto inputStream = (filename == "-") ? stdin : filename.File(); 1229 foreach (ulong fileLineNum, line; inputStream.bufferedByLine!(KeepTerminator.no).enumerate(1)) 1230 { 1231 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, filename, fileLineNum); 1232 if (fileLineNum == 1 && cmdopt.hasHeader) 1233 { 1234 if (!headerWritten) 1235 { 1236 outputStream.put(line); 1237 outputStream.put("\n"); 1238 headerWritten = true; 1239 } 1240 } 1241 else 1242 { 1243 /* Add lines to the reservoir until the reservoir is filled. 1244 * After that lines are added with decreasing likelihood, based on 1245 * the total number of lines seen. If added to the reservoir, the 1246 * line replaces a randomly chosen existing line. 1247 */ 1248 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1249 else alias entryCTArgs = AliasSeq!(); 1250 1251 if (totalLineNum < cmdopt.sampleSize) 1252 { 1253 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs); 1254 } 1255 else 1256 { 1257 immutable size_t i = uniform(0, totalLineNum, randomGenerator); 1258 if (i < reservoir.length) 1259 { 1260 reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs); 1261 } 1262 } 1263 1264 ++totalLineNum; 1265 } 1266 } 1267 } 1268 1269 /* Done with input. The sample is in the reservoir. Update the order and print. */ 1270 1271 static if (preserveInputOrder) 1272 { 1273 reservoir.sort!((a, b) => a.lineNumber < b.lineNumber); 1274 } 1275 else 1276 { 1277 reservoir.randomShuffle(randomGenerator); 1278 } 1279 1280 foreach (ref entry; reservoir) 1281 { 1282 outputStream.put(entry.line); 1283 outputStream.put("\n"); 1284 } 1285 } 1286 1287 /** Shuffling command handler. Invokes the appropriate shuffle (line order 1288 * randomization) routine based on the command line arguments. 
1289 * 1290 * Shuffling has similarities to random sampling, but the algorithms used are 1291 * different. Random sampling selects a subset, only the current subset selection 1292 * needs to be kept in memory. This is supported by reservoir sampling. By contrast, 1293 * shuffling needs to hold all input in memory, so it works better to read all lines 1294 * into memory at once and then shuffle. 1295 * 1296 * Two different algorithms are used. Array shuffling is used for unweighted shuffling. 1297 * Sorting plus random weight assignments is used for weighted shuffling and when 1298 * compatibility mode is being used. 1299 * 1300 * The algorithms used here are all limited by available memory. 1301 */ 1302 void shuffleCommand(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1303 if (isOutputRange!(OutputRange, char)) 1304 { 1305 if (cmdopt.hasWeightField) 1306 { 1307 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1308 } 1309 else if (cmdopt.compatibilityMode) 1310 { 1311 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1312 } 1313 else 1314 { 1315 randomizeLinesViaShuffle(cmdopt, outputStream); 1316 } 1317 } 1318 1319 /** Shuffle all input lines by assigning random weights and sorting. 1320 * 1321 * randomizeLinesViaSort reads in all input lines and writes them out in random order. 1322 * The algorithm works by assigning a random value to each line and sorting. Both 1323 * weighted and unweighted shuffling are supported. 1324 * 1325 * Notes: 1326 * $(LIST 1327 * * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used 1328 * unless compatibility mode is needed. 1329 * * This routine is significantly faster than heap-based reservoir sampling in the 1330 * case where the entire file is being read. 1331 * * Input data must be read entirely in memory. Disk oriented techniques are needed 1332 * when data sizes get too large for available memory. 
One option is to generate 1333 * random values for each line, e.g. --gen-random-inorder, and sort with a disk- 1334 * backed sort program like GNU sort. 1335 * ) 1336 */ 1337 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange) 1338 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1339 if (isOutputRange!(OutputRange, char)) 1340 { 1341 import std.algorithm : map, sort; 1342 1343 static if (isWeighted) assert(cmdopt.hasWeightField); 1344 else assert(!cmdopt.hasWeightField); 1345 1346 assert(cmdopt.sampleSize == 0); 1347 1348 /* 1349 * Read all file data into memory. Then split the data into lines and assign a 1350 * random value to each line. identifyInputLines also writes the first header line. 1351 */ 1352 const fileData = cmdopt.files.readFileData; 1353 auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt, outputStream); 1354 1355 /* 1356 * Sort by the weight and output the lines. 1357 */ 1358 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1359 1360 foreach (lineEntry; inputLines) 1361 { 1362 if (cmdopt.printRandom) 1363 { 1364 outputStream.formatRandomValue(lineEntry.randomValue); 1365 outputStream.put(cmdopt.delim); 1366 } 1367 outputStream.put(lineEntry.data); 1368 outputStream.put("\n"); 1369 } 1370 } 1371 1372 /** Shuffle (randomize) all input lines using a shuffling algorithm. 1373 * 1374 * All lines in files and/or standard input are read in and written out in random 1375 * order. This routine uses array shuffling, which is faster than sorting. It is a 1376 * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the 1377 * most common case). 1378 * 1379 * Input data size is limited by available memory. Disk oriented techniques are needed 1380 * when data sizes are larger. For example, generating random values line-by-line (ala 1381 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 
1382 * 1383 * This routine does not support random value printing or compatibility-mode. 1384 */ 1385 void randomizeLinesViaShuffle(OutputRange)(TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1386 if (isOutputRange!(OutputRange, char)) 1387 { 1388 import std.algorithm : map; 1389 import std.random : Random = Mt19937, randomShuffle; 1390 1391 assert(cmdopt.sampleSize == 0); 1392 assert(!cmdopt.hasWeightField); 1393 assert(!cmdopt.printRandom); 1394 assert(!cmdopt.genRandomInorder); 1395 1396 /* 1397 * Read all file data into memory and split into lines. 1398 */ 1399 const fileData = cmdopt.files.readFileData; 1400 auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1401 1402 /* 1403 * Randomly shuffle and print each line. 1404 * 1405 * Note: Also tried randomCover, but that was exceedingly slow. 1406 */ 1407 import std.random : randomShuffle; 1408 1409 auto randomGenerator = Random(cmdopt.seed); 1410 inputLines.randomShuffle(randomGenerator); 1411 1412 foreach (ref line; inputLines) 1413 { 1414 outputStream.put(line.data); 1415 outputStream.put("\n"); 1416 } 1417 } 1418 1419 /** Simple random sampling with replacement. 1420 * 1421 * All lines in files and/or standard input are read in. Then random lines are selected 1422 * one at a time and output. Lines can be selected multiple times. This process continues 1423 * until the desired number of samples (--n|num) has been output. Output continues 1424 * indefinitely if a sample size was not provided. 1425 */ 1426 void simpleRandomSamplingWithReplacement(OutputRange) 1427 (TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1428 if (isOutputRange!(OutputRange, char)) 1429 { 1430 import std.algorithm : map; 1431 import std.random : Random = Mt19937, uniform; 1432 1433 /* 1434 * Read all file data into memory and split the data into lines. 
1435 */ 1436 const fileData = cmdopt.files.readFileData; 1437 const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt, outputStream); 1438 1439 if (inputLines.length > 0) 1440 { 1441 auto randomGenerator = Random(cmdopt.seed); 1442 1443 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1444 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1445 while (numLeft != 0) 1446 { 1447 immutable size_t index = uniform(0, inputLines.length, randomGenerator); 1448 outputStream.put(inputLines[index].data); 1449 outputStream.put("\n"); 1450 if (cmdopt.sampleSize != 0) numLeft--; 1451 } 1452 } 1453 } 1454 1455 /** A container holding data read from a file or standard input. 1456 * 1457 * The InputBlock struct is used to represent a block of data read from a file or 1458 * standard input. An array of InputBlocks is returned by readFileData. Typically one 1459 * block per file. Multiple blocks are used for standard input and when the file size 1460 * cannot be determined. Individual lines are not allowed to span blocks. The blocks 1461 * allocated to an individual file are numbered starting with zero. 1462 * 1463 * See readFileData() for more information. 1464 */ 1465 static struct InputBlock 1466 { 1467 string filename; /// Original filename or path. "-" denotes standard input. 1468 size_t fileBlockNumber; /// Zero-based block number for the file. 1469 char[] data; /// The actual data. Newline terminated or last block for the file. 1470 } 1471 1472 /** Read data from one or more files. This routine is used by algorithms needing to 1473 * read all data into memory. 1474 * 1475 * readFileData reads in all data from a set of files. Data is returned as an array 1476 * of InputBlock structs. Normally one InputBlock per file, sized to match the size 1477 * of the file. Standard input is read in one or more blocks, as are files whose size 1478 * cannot be determined. 
Multiple blocks are used in these last two cases to avoid 1479 * expensive memory reallocations. This is not necessary when file size is known as 1480 * the necessary memory can be preallocated. 1481 * 1482 * Individual lines never span multiple blocks, and newlines are preserved. This 1483 * means that each block starts at the beginning of a line and ends with a newline 1484 * unless the end of a file has been reached. Each file gets its own block so that 1485 * header processing can be done. 1486 */ 1487 InputBlock[] readFileData(const string[] files) 1488 { 1489 import std.algorithm : find, min; 1490 import std.range : retro; 1491 1492 enum BlockSize = 1024L * 1024L * 1024L; // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.) 1493 enum ReadSize = 1024L * 128L; 1494 enum NewlineSearchSize = 1024L * 16L; 1495 1496 InputBlock[] blocks; 1497 auto blocksAppender = appender(&blocks); 1498 blocksAppender.reserve(files.length); // At least one block per file. 1499 1500 ubyte[] rawReadBuffer = new ubyte[ReadSize]; 1501 1502 foreach (filename; files) 1503 { 1504 /* If the file size can be determined then read it as a single block. 1505 * Otherwise read as multiple blocks. File.size() returns ulong.max 1506 * if file size cannot be determined, so we'll combine that check 1507 * with the standard input case. 1508 */ 1509 1510 auto ifile = (filename == "-") ? stdin : filename.File; 1511 immutable ulong filesize = (filename == "-") ? ulong.max : ifile.size; 1512 1513 if (filesize != ulong.max) 1514 { 1515 readFileDataAsOneBlock(filename, ifile, filesize, blocksAppender, rawReadBuffer); 1516 } 1517 else 1518 { 1519 readFileDataAsMultipleBlocks(filename, ifile, blocksAppender, rawReadBuffer, 1520 BlockSize, NewlineSearchSize); 1521 } 1522 } 1523 return blocks; 1524 } 1525 1526 /* readFileData() helper function. Read data from a File handle as a single block. The 1527 * new block is appended to an existing InputBlock[] array. 
 *
 * readFileDataAsOneBlock is part of the readFileData logic. It handles the case
 * where a file is being read as a single block. Normally initialBlockSize is passed
 * as the size of the file.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsOneBlock(
    string filename,
    ref File ifile,
    const ulong initialBlockSize,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer)
{
    /* A single InputBlock holds the whole file; its block number is always zero. */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    /* Reserving the expected file size up front avoids repeated reallocation. */
    dataAppender.reserve(initialBlockSize);

    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        dataAppender.put(cast(char[]) buffer);
    }
}

/* readFileData() helper function. Read data from a File handle as one or more blocks.
 * Blocks are appended to an existing InputBlock[] array.
 *
 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case
 * where a file or standard input is being read as a series of blocks. This is the
 * standard approach for standard input, but also applies when the file size cannot be
 * determined.
 *
 * Blocks are split on newline boundaries whenever possible, so that a line rarely
 * spans two blocks. 'newlineSearchSize' bounds how far backward in the current block
 * the newline search goes before giving up and letting the block grow.
 *
 * This routine has been separated out to enable unit testing. At present it is not
 * intended as a general API. See readFileData for more info.
 */
private void readFileDataAsMultipleBlocks(
    string filename,
    ref File ifile,
    ref RefAppender!(InputBlock[]) blocksAppender,
    ref ubyte[] rawReadBuffer,
    const size_t blockSize,
    const size_t newlineSearchSize)
{
    import std.algorithm : find, min;
    import std.range : retro;

    assert(ifile.isOpen);

    /* Create a new block for the file and an Appender for writing data.
     */
    blocksAppender.put(InputBlock(filename, 0));
    auto dataAppender = appender(&(blocksAppender.data[$-1].data));
    dataAppender.reserve(blockSize);
    size_t blockNumber = 0;

    /* Read all the data and copy it to an InputBlock. */
    foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer))
    {
        assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber);

        immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length;

        if (buffer.length <= remainingCapacity)
        {
            /* The whole read buffer fits in the current block. */
            dataAppender.put(cast(char[]) buffer);
        }
        else
        {
            /* Look for the last newline in the input buffer that fits in remaining
             * capacity of the block.
             */
            auto searchRegion = buffer[0 .. remainingCapacity];
            auto appendRegion = searchRegion.retro.find('\n').source;

            if (appendRegion.length > 0)
            {
                /* Copy the first part of the read buffer to the block. */
                dataAppender.put(cast(char[]) appendRegion);

                /* Create a new InputBlock and copy the remaining data to it. */
                blockNumber++;
                blocksAppender.put(InputBlock(filename, blockNumber));
                dataAppender = appender(&(blocksAppender.data[$-1].data));
                dataAppender.reserve(blockSize);
                dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]);

                assert(blocksAppender.data.length >= 2);
                assert(blocksAppender.data[$-2].data[$-1] == '\n');
            }
            else
            {
                /* Search backward in the current block for a newline. If found, it
                 * becomes the last newline in the current block. Anything following
                 * it is moved to the new block. If a newline is not found, simply append
                 * to the current block and let it grow. We'll only search backward
                 * so far.
                 */
                immutable size_t currBlockLength = blocksAppender.data[$-1].data.length;
                immutable size_t searchLength = min(currBlockLength, newlineSearchSize);
                immutable size_t searchStart = currBlockLength - searchLength;
                auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $];
                auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length;

                if (lastNewlineOffset != 0)
                {
                    /* Create a new InputBlock. The previous InputBlock is then found
                     * at blocksAppender.data[$-2]. It may be a physically different
                     * struct (a copy) if the blocks array gets reallocated.
                     */
                    blockNumber++;
                    blocksAppender.put(InputBlock(filename, blockNumber));
                    dataAppender = appender(&(blocksAppender.data[$-1].data));
                    dataAppender.reserve(blockSize);

                    /* Copy data following the newline from the last block to the new
                     * block. Then append the current read buffer.
                     */
                    immutable size_t moveRegionStart = searchStart + lastNewlineOffset;
                    dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]);
                    dataAppender.put(cast(char[]) buffer);

                    /* Now delete the moved region from the last block. */
                    blocksAppender.data[$-2].data.length = moveRegionStart;

                    assert(blocksAppender.data.length >= 2);
                    assert(blocksAppender.data[$-2].data[$-1] == '\n');
                }
                else
                {
                    /* Give up. Allow the current block to grow. */
                    dataAppender.put(cast(char[]) buffer);
                }
            }
        }
    }
}

/** HasRandomValue is a boolean flag used at compile time by identifyInputLines to
 * distinguish use cases needing random value assignments from those that don't.
 */
alias HasRandomValue = Flag!"hasRandomValue";

/** An InputLine array is returned by identifyInputLines to represent each non-header
 * line found in a FileData array. The 'data' element contains the line.
A 'randomValue'
 * member is included if random values are being generated.
 */
static struct InputLine(HasRandomValue hasRandomValue)
{
    const(char)[] data;                              // The line, without a trailing newline.
    static if (hasRandomValue) double randomValue;   // Random value assigned to this line.
}

/** identifyInputLines is used by algorithms that read all files into memory prior to
 * processing. It does the initial processing of the file data.
 *
 * Three primary tasks are performed. One is splitting all input data into lines. The
 * second is writing the header line from the first file to the output stream. Header
 * lines from subsequent files are ignored. Third is assigning a random value to the
 * line, if random values are being generated.
 *
 * The key input is an InputBlock array. Normally one block for each file, but standard
 * input may have multiple blocks.
 *
 * The return value is an array of InputLine structs. The struct will have a 'randomValue'
 * member if random values are being assigned.
 */
InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted, OutputRange)
(const ref InputBlock[] inputBlocks, TsvSampleOptions cmdopt, auto ref OutputRange outputStream)
if (isOutputRange!(OutputRange, char))
{
    import std.algorithm : splitter;
    import std.array : appender;
    import std.random : Random = Mt19937, uniform01;
    import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix;

    /* Weighted operation requires random values; printing random values requires
     * that they be generated.
     */
    static assert(hasRandomValue || !isWeighted);
    static if(!hasRandomValue) assert(!cmdopt.printRandom);

    InputLine!hasRandomValue[] inputLines;

    auto linesAppender = appender(&inputLines);
    static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed);
    bool headerWritten = false;
    size_t fileLineNum;

    foreach (block; inputBlocks)
    {
        /* Drop the last newline to avoid adding an extra empty line.
         */
        const data = (block.data.length > 0 && block.data[$-1] == '\n') ?
            block.data[0 .. $-1] : block.data;

        /* Block number zero marks the start of a new file; restart line counting. */
        if (block.fileBlockNumber == 0) fileLineNum = 0;

        foreach (ref line; data.splitter('\n'))
        {
            fileLineNum++;

            if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum);
            if (fileLineNum == 1 && cmdopt.hasHeader)
            {
                /* Only the first file's header line is written out. */
                if (!headerWritten)
                {
                    if (cmdopt.printRandom)
                    {
                        outputStream.put(cmdopt.randomValueHeader);
                        outputStream.put(cmdopt.delim);
                    }
                    outputStream.put(line);
                    outputStream.put("\n");
                    headerWritten = true;
                }
            }
            else
            {
                static if (!hasRandomValue)
                {
                    linesAppender.put(InputLine!hasRandomValue(line));
                }
                else
                {
                    static if (!isWeighted)
                    {
                        immutable double randomValue = uniform01(randomGenerator);
                    }
                    else
                    {
                        immutable double lineWeight =
                            getFieldValue!double(line, cmdopt.weightField, cmdopt.delim,
                                                 block.filename, fileLineNum);
                        /* Weighted key: uniform^(1/weight). Larger weights tend to
                         * produce larger keys; non-positive weights get key zero.
                         */
                        immutable double randomValue =
                            (lineWeight > 0.0)
                            ? uniform01(randomGenerator) ^^ (1.0 / lineWeight)
                            : 0.0;
                    }

                    linesAppender.put(InputLine!hasRandomValue(line, randomValue));
                }
            }
        }
    }

    return inputLines;
}

/* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios.
 * Other use paths are well tested by the tests at the end cases.
 */
unittest
{
    import tsv_utils.common.unittest_utils;
    import std.algorithm : equal, find, joiner, splitter;
    import std.array : appender;
    import std.file : rmdirRecurse;
    import std.format : format;
    import std.path : buildPath;
    import std.range : repeat;

    auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData");
    scope(exit) rfdTestDir.rmdirRecurse;

    /* Build three in-memory data sets, then write them to temp files. */
    char[] file1Data;
    char[] file2Data;
    char[] file3Data;

    auto app1 = appender(&file1Data);
    auto app2 = appender(&file2Data);
    auto app3 = appender(&file3Data);

    /* File 1: 1000 short lines. */
    app1.put("\n".repeat(100).joiner);
    app1.put("x\n".repeat(100).joiner);
    app1.put("yz\n".repeat(100).joiner);
    app1.put("pqr\n".repeat(100).joiner);
    app1.put("a\nbc\ndef\n".repeat(100).joiner);
    app1.put('\n'.repeat(100));
    app1.put("z\n".repeat(100).joiner);
    app1.put("xy\n".repeat(100).joiner);

    /* File 2: 500 longer lines. */
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);
    app2.put(
        "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n"
        .repeat(100)
        .joiner);
    app2.put(
        "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n"
        .repeat(100)
        .joiner);

    /* File 3: 1000 mixed length lines.
     */
    app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner);

    string file1Path = buildPath(rfdTestDir, "file1.txt");
    string file2Path = buildPath(rfdTestDir, "file2.txt");
    string file3Path = buildPath(rfdTestDir, "file3.txt");

    try
    {
        auto ofile1 = File(file1Path, "w");
        ofile1.write(file1Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Path, e.msg));

    try
    {
        auto ofile2 = File(file2Path, "w");
        ofile2.write(file2Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file2Path, e.msg));

    try
    {
        auto ofile3 = File(file3Path, "w");
        ofile3.write(file3Data);
    }
    catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file3Path, e.msg));

    /* Expected lines: all files concatenated, split on newlines, last empty entry dropped. */
    auto allData = file1Data ~ file2Data ~ file3Data;
    auto expectedLines = allData.splitter('\n').array[0 .. $-1];

    /* Header-mode expectation: first line of files 2 and 3 is treated as a header and dropped. */
    auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $];
    auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $];
    auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader;
    auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1];

    /* The two dropped headers account for the difference of two lines. */
    assert(expectedLines.length == expectedLinesUsingHeader.length + 2);

    TsvSampleOptions cmdoptNoHeader;
    auto noHeaderCmdArgs = ["unittest"];
    auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs);
    assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs));

    TsvSampleOptions cmdoptYesHeader;
    auto yesHeaderCmdArgs = ["unittest", "--header"];
    auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs);
    assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs));

    auto outputStream = appender!(char[])();

    {
        /* Reading as single blocks.
         */
        ubyte[] rawReadBuffer = new ubyte[256];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            ulong filesize = ifile.size;
            if (filesize == ulong.max) filesize = 1000;
            readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptNoHeader, outputStream);

        assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
    }

    {
        /* Reading as multiple blocks. Sweep newline-search sizes, block sizes, and
         * read-buffer sizes to exercise the block splitting paths.
         */
        foreach (size_t searchSize; [ 0, 1, 2, 64 ])
        {
            foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ])
            {
                foreach (size_t readSize; [ 1, 2, 8, 32 ])
                {
                    ubyte[] rawReadBuffer = new ubyte[readSize];
                    InputBlock[] blocks;
                    auto blocksAppender = appender(&blocks);
                    blocksAppender.reserve(3);
                    foreach (f; [ file1Path, file2Path, file3Path ])
                    {
                        auto ifile = f.File;
                        readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                                     rawReadBuffer, blockSize, searchSize);
                    }
                    auto inputLines =
                        identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                            blocks, cmdoptNoHeader, outputStream);

                    assert(equal!((a, b) => a.data == b)(inputLines, expectedLines));
                }
            }
        }
    }
    {
        /* Reading as multiple blocks, with header processing.
         */
        const size_t readSize = 32;
        const size_t blockSize = 48;
        const size_t searchSize = 16;

        ubyte[] rawReadBuffer = new ubyte[readSize];
        InputBlock[] blocks;
        auto blocksAppender = appender(&blocks);
        blocksAppender.reserve(3);
        foreach (f; [ file1Path, file2Path, file3Path ])
        {
            auto ifile = f.File;
            readFileDataAsMultipleBlocks(f, ifile, blocksAppender,
                                         rawReadBuffer, blockSize, searchSize);
        }
        auto inputLines =
            identifyInputLines!(No.hasRandomValue, No.isWeighted)(
                blocks, cmdoptYesHeader, outputStream);

        /* Only the first file's header should have been written to the output stream. */
        assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n');
        assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $]));
    }
}

/** Write a floating point random value to an output stream.
 *
 * This routine is used for floating point random value printing. This routine writes
 * 17 significant digits, the range available in doubles. This routine prefers decimal
 * format, without exponents. It will generate somewhat large precision numbers,
 * currently up to 28 digits, before switching to exponents.
 *
 * The primary reason for this approach is to enable faster sorting on random values
 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster
 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch).
 * The 'general numeric' handles exponential notation. The difference is 5-10x.
 *
 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12.
 * No examples less than 1e-09 were seen in hundreds of millions of trials. Similar
 * results were seen with weighted sampling with integer weights. The same is not true
 * with floating point weights. These produce quite large exponents. However, even
 * for floating point weights this can be useful.
For random weights [0,1] less than 5%
 * will be less than 1e-12 and use exponential notation.
 */
void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value)
if (isOutputRange!(OutputRange, char))
{
    import std.format : formatValue, singleSpec;

    /* Decimal-format specs: precision grows as the value shrinks, so that 17
     * significant digits are always retained.
     */
    immutable spec17f = singleSpec("%.17f");
    immutable spec18f = singleSpec("%.18f");
    immutable spec19f = singleSpec("%.19f");
    immutable spec20f = singleSpec("%.20f");
    immutable spec21f = singleSpec("%.21f");
    immutable spec22f = singleSpec("%.22f");
    immutable spec23f = singleSpec("%.23f");
    immutable spec24f = singleSpec("%.24f");
    immutable spec25f = singleSpec("%.25f");
    immutable spec26f = singleSpec("%.26f");
    immutable spec27f = singleSpec("%.27f");
    immutable spec28f = singleSpec("%.28f");

    /* Fallback for values below 1e-12: general format, which may use exponents. */
    immutable spec17g = singleSpec("%.17g");

    immutable formatSpec =
        (value >= 1e-01) ? spec17f :
        (value >= 1e-02) ? spec18f :
        (value >= 1e-03) ? spec19f :
        (value >= 1e-04) ? spec20f :
        (value >= 1e-05) ? spec21f :
        (value >= 1e-06) ? spec22f :
        (value >= 1e-07) ? spec23f :
        (value >= 1e-08) ? spec24f :
        (value >= 1e-09) ? spec25f :
        (value >= 1e-10) ? spec26f :
        (value >= 1e-11) ? spec27f :
        (value >= 1e-12) ? spec28f : spec17g;

    outputStream.formatValue(value, formatSpec);
}

@safe unittest
{
    /* Verify decimal format through 1e-12 and the switch to exponential below that. */
    void testFormatValue(double value, string expected)
    {
        import std.array : appender;
        import std.format : format;

        auto s = appender!string();
        s.formatRandomValue(value);
        assert(s.data == expected,
               format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data));
    }

    testFormatValue(1.0, "1.00000000000000000");
    testFormatValue(0.1, "0.10000000000000001");
    testFormatValue(0.01, "0.010000000000000000");
    testFormatValue(1e-03, "0.0010000000000000000");
    testFormatValue(1e-04, "0.00010000000000000000");
    testFormatValue(1e-05, "0.000010000000000000001");
    testFormatValue(1e-06, "0.0000010000000000000000");
    testFormatValue(1e-07, "0.00000010000000000000000");
    testFormatValue(1e-08, "0.000000010000000000000000");
    testFormatValue(1e-09, "0.0000000010000000000000001");
    testFormatValue(1e-10, "0.00000000010000000000000000");
    testFormatValue(1e-11, "0.000000000009999999999999999");
    testFormatValue(1e-12, "0.0000000000010000000000000000");
    testFormatValue(1e-13, "1e-13");
    testFormatValue(1e-14, "1e-14");
    testFormatValue(12345678901234567e-15, "12.34567890123456735");
    testFormatValue(12345678901234567e-16, "1.23456789012345669");
    testFormatValue(12345678901234567e-17, "0.12345678901234566");
    testFormatValue(12345678901234567e-18, "0.012345678901234567");
    testFormatValue(12345678901234567e-19, "0.0012345678901234567");
    testFormatValue(12345678901234567e-20, "0.00012345678901234567");
    testFormatValue(12345678901234567e-21, "0.000012345678901234568");
    testFormatValue(12345678901234567e-22, "0.0000012345678901234567");
    testFormatValue(12345678901234567e-23, "0.00000012345678901234566");
    testFormatValue(12345678901234567e-24, "0.000000012345678901234567");
    testFormatValue(12345678901234567e-25, "0.0000000012345678901234566");
    testFormatValue(12345678901234567e-26, "0.00000000012345678901234568");
    testFormatValue(12345678901234567e-27, "0.000000000012345678901234567");
    testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567");
    testFormatValue(12345678901234567e-29, "1.2345678901234566e-13");
}


/** Convenience function for extracting a single field from a line. See
 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error
 * text tailored for this program.
 */
import std.traits : isSomeChar;
T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe
if (isSomeChar!C)
{
    import std.conv : ConvException, to;
    import std.format : format;
    import tsv_utils.common.utils : getTsvFieldValue;

    T val;
    try
    {
        val = getTsvFieldValue!T(line, fieldIndex, delim);
    }
    catch (ConvException exc)
    {
        /* The field exists but could not be converted to type T. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s%s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum,
                   (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : ""));
    }
    catch (Exception exc)
    {
        /* Not enough fields on the line. */
        throw new Exception(
            format("Could not process line: %s\n File: %s Line: %s",
                   exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum));
    }

    return val;
}

@safe unittest
{
    /* getFieldValue unit tests. getTsvFieldValue has its own tests.
     * These tests make basic sanity checks on the getFieldValue wrapper.
     */
    import std.exception;

    assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123);
    assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4);
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1));
    assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2));
}

/* Unit tests for the main program start here.
 *
 * Portability note: Many of the tests here rely on generating consistent random numbers
 * across different platforms when using the same random seed. So far this has succeeded
 * on several different platform, compiler, and library versions. However, it is certainly
 * possible this condition will not hold on other platforms.
 *
 * For tsv-sample, this portability implies generating the same results on different
 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees,
 * but it is convenient for testing. If platforms are identified that do not generate
 * the same results these tests will need to be adjusted.
 */
version(unittest)
{
    /* Unit test helper functions. */

    import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/.
    import std.conv : to;

    /* Runs the main tsvSample processing line against cmdArgs and asserts that the
     * output matches 'expected'. cmdArgs[0] plays the role of the program name and
     * is included in assert messages.
     */
    void testTsvSample(string[] cmdArgs, string[][] expected)
    {
        import std.array : appender;
        import std.format : format;

        assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty.");

        auto formatAssertMessage(T...)(string msg, T formatArgs)
        {
            auto formatString = "[testTsvSample] %s: " ~ msg;
            return format(formatString, cmdArgs[0], formatArgs);
        }

        TsvSampleOptions cmdopt;
        auto savedCmdArgs = cmdArgs.to!string;   // processArgs modifies cmdArgs; save a copy for messages.
        auto r = cmdopt.processArgs(cmdArgs);
        assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs));
        auto output = appender!(char[])();

        tsvSample(cmdopt, output); // This invokes the main code line.

        auto expectedOutput = expected.tsvDataToString;

        assert(output.data == expectedOutput,
               formatAssertMessage(
                   "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================",
                   expectedOutput.to!string, output.data.to!string));
    }
}

unittest
{
    import std.path : buildPath;
    import std.file : rmdirRecurse;
    import std.format : format;

    auto testDir = makeUnittestTempDir("tsv_sample");
    scope(exit) testDir.rmdirRecurse;

    /* Tabular data sets and expected results use the built-in static seed.
     * Tests are run by writing the data set to a file, then calling the main
     * routine to process. The function testTsvSample plays the role of the
     * main program. Rather than writing to expected output, the results are
     * matched against expected. The expected results were verified by hand
     * prior to inclusion in the test.
     *
     * The initial part of this section is simply setting up data files and
     * expected results.
     *
     * Expected results naming conventions:
     * - Prefix: dataNxMExpected. N and M are numbers. e.g.
data3x6Expected 2165 * - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct 2166 * - Compatibility: Compat, AlgoR, Skip, Swap, Inorder 2167 * - Weight Field: Wt<num>, e.g. Wt3 2168 * - Sample Size: Num<num>, eg. Num3 2169 * - Seed Value: V<num>, eg. V77 2170 * - Key Field: K<num>, e.g. K2 2171 * - Probability: P<num>, e.g P05 (5%) 2172 * - Printing Probabilities: Probs 2173 * - Printing Probs in order: ProbsInorder 2174 * - Printing Probs with custom header: RVCustom 2175 */ 2176 2177 /* Empty file. */ 2178 string[][] dataEmpty = []; 2179 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 2180 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 2181 2182 /* 3x1, header only. */ 2183 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 2184 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 2185 writeUnittestTsvFile(fpath_data3x0, data3x0); 2186 2187 /* 3x1 */ 2188 string[][] data3x1 = 2189 [["field_a", "field_b", "field_c"], 2190 ["tan", "タン", "8.5"]]; 2191 2192 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 2193 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 2194 writeUnittestTsvFile(fpath_data3x1, data3x1); 2195 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]); 2196 2197 string[][] data3x1ExpectedReplaceNum3 = 2198 [["field_a", "field_b", "field_c"], 2199 ["tan", "タン", "8.5"], 2200 ["tan", "タン", "8.5"], 2201 ["tan", "タン", "8.5"]]; 2202 2203 /* 3x2 */ 2204 string[][] data3x2 = 2205 [["field_a", "field_b", "field_c"], 2206 ["brown", "褐色", "29.2"], 2207 ["gray", "グレー", "6.2"]]; 2208 2209 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 2210 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 2211 writeUnittestTsvFile(fpath_data3x2, data3x2); 2212 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. 
$]); 2213 2214 string[][] data3x2PermuteCompat = 2215 [["field_a", "field_b", "field_c"], 2216 ["gray", "グレー", "6.2"], 2217 ["brown", "褐色", "29.2"]]; 2218 2219 string[][] data3x2PermuteShuffle = 2220 [["field_a", "field_b", "field_c"], 2221 ["gray", "グレー", "6.2"], 2222 ["brown", "褐色", "29.2"]]; 2223 2224 /* 3x3 */ 2225 string[][] data3x3 = 2226 [["field_a", "field_b", "field_c"], 2227 ["orange", "オレンジ", "2.5"], 2228 ["pink", "ピンク", "1.1"], 2229 ["purple", "紫の", "42"]]; 2230 2231 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 2232 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 2233 writeUnittestTsvFile(fpath_data3x3, data3x3); 2234 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]); 2235 2236 string[][] data3x3ExpectedPermuteCompat = 2237 [["field_a", "field_b", "field_c"], 2238 ["purple", "紫の", "42"], 2239 ["pink", "ピンク", "1.1"], 2240 ["orange", "オレンジ", "2.5"]]; 2241 2242 string[][] data3x3ExpectedPermuteSwap = 2243 [["field_a", "field_b", "field_c"], 2244 ["purple", "紫の", "42"], 2245 ["orange", "オレンジ", "2.5"], 2246 ["pink", "ピンク", "1.1"]]; 2247 2248 /* 3x6 */ 2249 string[][] data3x6 = 2250 [["field_a", "field_b", "field_c"], 2251 ["red", "赤", "23.8"], 2252 ["green", "緑", "0.0072"], 2253 ["white", "白", "1.65"], 2254 ["yellow", "黄", "12"], 2255 ["blue", "青", "12"], 2256 ["black", "黒", "0.983"]]; 2257 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 2258 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 2259 writeUnittestTsvFile(fpath_data3x6, data3x6); 2260 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. 
$]); 2261 2262 // Randomization, all lines 2263 string[][] data3x6ExpectedPermuteCompat = 2264 [["field_a", "field_b", "field_c"], 2265 ["yellow", "黄", "12"], 2266 ["black", "黒", "0.983"], 2267 ["blue", "青", "12"], 2268 ["white", "白", "1.65"], 2269 ["green", "緑", "0.0072"], 2270 ["red", "赤", "23.8"]]; 2271 2272 string[][] data3x6ExpectedPermuteSwap = 2273 [["field_a", "field_b", "field_c"], 2274 ["black", "黒", "0.983"], 2275 ["green", "緑", "0.0072"], 2276 ["red", "赤", "23.8"], 2277 ["yellow", "黄", "12"], 2278 ["white", "白", "1.65"], 2279 ["blue", "青", "12"]]; 2280 2281 string[][] data3x6ExpectedPermuteCompatProbs = 2282 [["random_value", "field_a", "field_b", "field_c"], 2283 ["0.96055546286515892", "yellow", "黄", "12"], 2284 ["0.75710153928957880", "black", "黒", "0.983"], 2285 ["0.52525980887003243", "blue", "青", "12"], 2286 ["0.49287854949943721", "white", "白", "1.65"], 2287 ["0.15929344086907804", "green", "緑", "0.0072"], 2288 ["0.010968807619065046", "red", "赤", "23.8"]]; 2289 2290 /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 2291 * both are effectively the same algorithm given that --num is data length. Both read 2292 * in the full data in order then call randomShuffle. 
2293 */ 2294 string[][] data3x6ExpectedSampleAlgoRNum6 = 2295 [["field_a", "field_b", "field_c"], 2296 ["black", "黒", "0.983"], 2297 ["green", "緑", "0.0072"], 2298 ["red", "赤", "23.8"], 2299 ["yellow", "黄", "12"], 2300 ["white", "白", "1.65"], 2301 ["blue", "青", "12"]]; 2302 2303 string[][] data3x6ExpectedSampleAlgoRNum5 = 2304 [["field_a", "field_b", "field_c"], 2305 ["red", "赤", "23.8"], 2306 ["black", "黒", "0.983"], 2307 ["white", "白", "1.65"], 2308 ["green", "緑", "0.0072"], 2309 ["yellow", "黄", "12"]]; 2310 2311 string[][] data3x6ExpectedSampleAlgoRNum4 = 2312 [["field_a", "field_b", "field_c"], 2313 ["blue", "青", "12"], 2314 ["green", "緑", "0.0072"], 2315 ["black", "黒", "0.983"], 2316 ["white", "白", "1.65"]]; 2317 2318 string[][] data3x6ExpectedSampleAlgoRNum3 = 2319 [["field_a", "field_b", "field_c"], 2320 ["red", "赤", "23.8"], 2321 ["black", "黒", "0.983"], 2322 ["green", "緑", "0.0072"]]; 2323 2324 string[][] data3x6ExpectedSampleAlgoRNum2 = 2325 [["field_a", "field_b", "field_c"], 2326 ["black", "黒", "0.983"], 2327 ["red", "赤", "23.8"]]; 2328 2329 string[][] data3x6ExpectedSampleAlgoRNum1 = 2330 [["field_a", "field_b", "field_c"], 2331 ["green", "緑", "0.0072"]]; 2332 2333 /* Inorder versions. 
*/ 2334 string[][] data3x6ExpectedSampleAlgoRNum6Inorder = 2335 [["field_a", "field_b", "field_c"], 2336 ["red", "赤", "23.8"], 2337 ["green", "緑", "0.0072"], 2338 ["white", "白", "1.65"], 2339 ["yellow", "黄", "12"], 2340 ["blue", "青", "12"], 2341 ["black", "黒", "0.983"]]; 2342 2343 string[][] data3x6ExpectedSampleAlgoRNum5Inorder = 2344 [["field_a", "field_b", "field_c"], 2345 ["red", "赤", "23.8"], 2346 ["green", "緑", "0.0072"], 2347 ["white", "白", "1.65"], 2348 ["yellow", "黄", "12"], 2349 ["black", "黒", "0.983"]]; 2350 2351 string[][] data3x6ExpectedSampleAlgoRNum4Inorder = 2352 [["field_a", "field_b", "field_c"], 2353 ["green", "緑", "0.0072"], 2354 ["white", "白", "1.65"], 2355 ["blue", "青", "12"], 2356 ["black", "黒", "0.983"]]; 2357 2358 string[][] data3x6ExpectedSampleAlgoRNum3Inorder = 2359 [["field_a", "field_b", "field_c"], 2360 ["red", "赤", "23.8"], 2361 ["green", "緑", "0.0072"], 2362 ["black", "黒", "0.983"]]; 2363 2364 string[][] data3x6ExpectedSampleAlgoRNum2Inorder = 2365 [["field_a", "field_b", "field_c"], 2366 ["red", "赤", "23.8"], 2367 ["black", "黒", "0.983"]]; 2368 2369 string[][] data3x6ExpectedSampleAlgoRNum1Inorder = 2370 [["field_a", "field_b", "field_c"], 2371 ["green", "緑", "0.0072"]]; 2372 2373 /* Reservoir inorder */ 2374 string[][] data3x6ExpectedSampleCompatNum6Inorder = 2375 [["field_a", "field_b", "field_c"], 2376 ["red", "赤", "23.8"], 2377 ["green", "緑", "0.0072"], 2378 ["white", "白", "1.65"], 2379 ["yellow", "黄", "12"], 2380 ["blue", "青", "12"], 2381 ["black", "黒", "0.983"]]; 2382 2383 string[][] data3x6ExpectedSampleCompatNum5Inorder = 2384 [["field_a", "field_b", "field_c"], 2385 ["green", "緑", "0.0072"], 2386 ["white", "白", "1.65"], 2387 ["yellow", "黄", "12"], 2388 ["blue", "青", "12"], 2389 ["black", "黒", "0.983"]]; 2390 2391 string[][] data3x6ExpectedSampleCompatNum4Inorder = 2392 [["field_a", "field_b", "field_c"], 2393 ["white", "白", "1.65"], 2394 ["yellow", "黄", "12"], 2395 ["blue", "青", "12"], 2396 ["black", "黒", "0.983"]]; 2397 
2398 string[][] data3x6ExpectedSampleCompatNum3Inorder = 2399 [["field_a", "field_b", "field_c"], 2400 ["yellow", "黄", "12"], 2401 ["blue", "青", "12"], 2402 ["black", "黒", "0.983"]]; 2403 2404 string[][] data3x6ExpectedSampleCompatNum2Inorder = 2405 [["field_a", "field_b", "field_c"], 2406 ["yellow", "黄", "12"], 2407 ["black", "黒", "0.983"]]; 2408 2409 string[][] data3x6ExpectedSampleCompatNum1Inorder = 2410 [["field_a", "field_b", "field_c"], 2411 ["yellow", "黄", "12"]]; 2412 2413 2414 /* Reservoir inorder with probabilities. */ 2415 string[][] data3x6ExpectedSampleCompatNum6ProbsInorder = 2416 [["random_value", "field_a", "field_b", "field_c"], 2417 ["0.010968807619065046", "red", "赤", "23.8"], 2418 ["0.15929344086907804", "green", "緑", "0.0072"], 2419 ["0.49287854949943721", "white", "白", "1.65"], 2420 ["0.96055546286515892", "yellow", "黄", "12"], 2421 ["0.52525980887003243", "blue", "青", "12"], 2422 ["0.75710153928957880", "black", "黒", "0.983"]]; 2423 2424 string[][] data3x6ExpectedSampleCompatNum5ProbsInorder = 2425 [["random_value", "field_a", "field_b", "field_c"], 2426 ["0.15929344086907804", "green", "緑", "0.0072"], 2427 ["0.49287854949943721", "white", "白", "1.65"], 2428 ["0.96055546286515892", "yellow", "黄", "12"], 2429 ["0.52525980887003243", "blue", "青", "12"], 2430 ["0.75710153928957880", "black", "黒", "0.983"]]; 2431 2432 string[][] data3x6ExpectedSampleCompatNum4ProbsInorder = 2433 [["random_value", "field_a", "field_b", "field_c"], 2434 ["0.49287854949943721", "white", "白", "1.65"], 2435 ["0.96055546286515892", "yellow", "黄", "12"], 2436 ["0.52525980887003243", "blue", "青", "12"], 2437 ["0.75710153928957880", "black", "黒", "0.983"]]; 2438 2439 string[][] data3x6ExpectedSampleCompatNum3ProbsInorder = 2440 [["random_value", "field_a", "field_b", "field_c"], 2441 ["0.96055546286515892", "yellow", "黄", "12"], 2442 ["0.52525980887003243", "blue", "青", "12"], 2443 ["0.75710153928957880", "black", "黒", "0.983"]]; 2444 2445 string[][] 
data3x6ExpectedSampleCompatNum2ProbsInorder = 2446 [["random_value", "field_a", "field_b", "field_c"], 2447 ["0.96055546286515892", "yellow", "黄", "12"], 2448 ["0.75710153928957880", "black", "黒", "0.983"]]; 2449 2450 string[][] data3x6ExpectedSampleCompatNum1ProbsInorder = 2451 [["random_value", "field_a", "field_b", "field_c"], 2452 ["0.96055546286515892", "yellow", "黄", "12"]]; 2453 2454 string[][] data3x6ExpectedWt3Num6Inorder = 2455 [["field_a", "field_b", "field_c"], 2456 ["red", "赤", "23.8"], 2457 ["green", "緑", "0.0072"], 2458 ["white", "白", "1.65"], 2459 ["yellow", "黄", "12"], 2460 ["blue", "青", "12"], 2461 ["black", "黒", "0.983"]]; 2462 2463 string[][] data3x6ExpectedWt3Num5Inorder = 2464 [["field_a", "field_b", "field_c"], 2465 ["green", "緑", "0.0072"], 2466 ["white", "白", "1.65"], 2467 ["yellow", "黄", "12"], 2468 ["blue", "青", "12"], 2469 ["black", "黒", "0.983"]]; 2470 2471 string[][] data3x6ExpectedWt3Num4Inorder = 2472 [["field_a", "field_b", "field_c"], 2473 ["white", "白", "1.65"], 2474 ["yellow", "黄", "12"], 2475 ["blue", "青", "12"], 2476 ["black", "黒", "0.983"]]; 2477 2478 string[][] data3x6ExpectedWt3Num3Inorder = 2479 [["field_a", "field_b", "field_c"], 2480 ["yellow", "黄", "12"], 2481 ["blue", "青", "12"], 2482 ["black", "黒", "0.983"]]; 2483 2484 string[][] data3x6ExpectedWt3Num2Inorder = 2485 [["field_a", "field_b", "field_c"], 2486 ["yellow", "黄", "12"], 2487 ["black", "黒", "0.983"]]; 2488 2489 string[][] data3x6ExpectedWt3Num1Inorder = 2490 [["field_a", "field_b", "field_c"], 2491 ["yellow", "黄", "12"]]; 2492 2493 2494 string[][] data3x6ExpectedBernoulliProbsP100 = 2495 [["random_value", "field_a", "field_b", "field_c"], 2496 ["0.010968807619065046", "red", "赤", "23.8"], 2497 ["0.15929344086907804", "green", "緑", "0.0072"], 2498 ["0.49287854949943721", "white", "白", "1.65"], 2499 ["0.96055546286515892", "yellow", "黄", "12"], 2500 ["0.52525980887003243", "blue", "青", "12"], 2501 ["0.75710153928957880", "black", "黒", "0.983"]]; 2502 2503 
string[][] data3x6ExpectedBernoulliCompatProbsP60 = 2504 [["random_value", "field_a", "field_b", "field_c"], 2505 ["0.010968807619065046", "red", "赤", "23.8"], 2506 ["0.15929344086907804", "green", "緑", "0.0072"], 2507 ["0.49287854949943721", "white", "白", "1.65"], 2508 ["0.52525980887003243", "blue", "青", "12"]]; 2509 2510 string[][] data3x6ExpectedBernoulliSkipP40 = 2511 [["field_a", "field_b", "field_c"], 2512 ["red", "赤", "23.8"], 2513 ["green", "緑", "0.0072"], 2514 ["yellow", "黄", "12"]]; 2515 2516 string[][] data3x6ExpectedBernoulliCompatP60 = 2517 [["field_a", "field_b", "field_c"], 2518 ["red", "赤", "23.8"], 2519 ["green", "緑", "0.0072"], 2520 ["white", "白", "1.65"], 2521 ["blue", "青", "12"]]; 2522 2523 string[][] data3x6ExpectedDistinctK1K3P60 = 2524 [["field_a", "field_b", "field_c"], 2525 ["green", "緑", "0.0072"], 2526 ["white", "白", "1.65"], 2527 ["blue", "青", "12"]]; 2528 2529 string[][] data3x6ExpectedDistinctK1K3P60Probs = 2530 [["random_value", "field_a", "field_b", "field_c"], 2531 ["0", "green", "緑", "0.0072"], 2532 ["0", "white", "白", "1.65"], 2533 ["0", "blue", "青", "12"]]; 2534 2535 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 2536 [["custom_random_value_header", "field_a", "field_b", "field_c"], 2537 ["0", "green", "緑", "0.0072"], 2538 ["0", "white", "白", "1.65"], 2539 ["0", "blue", "青", "12"]]; 2540 2541 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 2542 [["random_value", "field_a", "field_b", "field_c"], 2543 ["1", "red", "赤", "23.8"], 2544 ["0", "green", "緑", "0.0072"], 2545 ["0", "white", "白", "1.65"], 2546 ["1", "yellow", "黄", "12"], 2547 ["3", "blue", "青", "12"], 2548 ["2", "black", "黒", "0.983"]]; 2549 2550 string[][] data3x6ExpectedPermuteWt3Probs = 2551 [["random_value", "field_a", "field_b", "field_c"], 2552 ["0.99665198757645390", "yellow", "黄", "12"], 2553 ["0.94775884809836686", "blue", "青", "12"], 2554 ["0.82728234682286661", "red", "赤", "23.8"], 2555 ["0.75346697377181959", "black", "黒", "0.983"], 2556 
["0.65130103496422487", "white", "白", "1.65"], 2557 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 2558 2559 string[][] data3x6ExpectedWt3ProbsInorder = 2560 [["random_value", "field_a", "field_b", "field_c"], 2561 ["0.82728234682286661", "red", "赤", "23.8"], 2562 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 2563 ["0.65130103496422487", "white", "白", "1.65"], 2564 ["0.99665198757645390", "yellow", "黄", "12"], 2565 ["0.94775884809836686", "blue", "青", "12"], 2566 ["0.75346697377181959", "black", "黒", "0.983"]]; 2567 2568 string[][] data3x6ExpectedPermuteWt3 = 2569 [["field_a", "field_b", "field_c"], 2570 ["yellow", "黄", "12"], 2571 ["blue", "青", "12"], 2572 ["red", "赤", "23.8"], 2573 ["black", "黒", "0.983"], 2574 ["white", "白", "1.65"], 2575 ["green", "緑", "0.0072"]]; 2576 2577 2578 string[][] data3x6ExpectedReplaceNum10 = 2579 [["field_a", "field_b", "field_c"], 2580 ["black", "黒", "0.983"], 2581 ["green", "緑", "0.0072"], 2582 ["green", "緑", "0.0072"], 2583 ["red", "赤", "23.8"], 2584 ["yellow", "黄", "12"], 2585 ["red", "赤", "23.8"], 2586 ["white", "白", "1.65"], 2587 ["yellow", "黄", "12"], 2588 ["yellow", "黄", "12"], 2589 ["white", "白", "1.65"], 2590 ]; 2591 2592 string[][] data3x6ExpectedReplaceNum10V77 = 2593 [["field_a", "field_b", "field_c"], 2594 ["black", "黒", "0.983"], 2595 ["red", "赤", "23.8"], 2596 ["black", "黒", "0.983"], 2597 ["yellow", "黄", "12"], 2598 ["green", "緑", "0.0072"], 2599 ["green", "緑", "0.0072"], 2600 ["green", "緑", "0.0072"], 2601 ["yellow", "黄", "12"], 2602 ["blue", "青", "12"], 2603 ["white", "白", "1.65"], 2604 ]; 2605 2606 /* Using a different static seed. 
*/ 2607 string[][] data3x6ExpectedPermuteCompatV41Probs = 2608 [["random_value", "field_a", "field_b", "field_c"], 2609 ["0.68057272653095424", "green", "緑", "0.0072"], 2610 ["0.67681624367833138", "blue", "青", "12"], 2611 ["0.32097338931635022", "yellow", "黄", "12"], 2612 ["0.25092361867427826", "red", "赤", "23.8"], 2613 ["0.15535934292711318", "black", "黒", "0.983"], 2614 ["0.046095821075141430", "white", "白", "1.65"]]; 2615 2616 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 2617 [["random_value", "field_a", "field_b", "field_c"], 2618 ["0.25092361867427826", "red", "赤", "23.8"], 2619 ["0.046095821075141430", "white", "白", "1.65"], 2620 ["0.32097338931635022", "yellow", "黄", "12"], 2621 ["0.15535934292711318", "black", "黒", "0.983"]]; 2622 2623 string[][] data3x6ExpectedPermuteWt3V41Probs = 2624 [["random_value", "field_a", "field_b", "field_c"], 2625 ["0.96799377498910666", "blue", "青", "12"], 2626 ["0.94356245792573568", "red", "赤", "23.8"], 2627 ["0.90964601024271996", "yellow", "黄", "12"], 2628 ["0.15491658409260103", "white", "白", "1.65"], 2629 ["0.15043620392537033", "black", "黒", "0.983"], 2630 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 2631 2632 string[][] data3x6ExpectedWt3V41ProbsInorder = 2633 [["random_value", "field_a", "field_b", "field_c"], 2634 ["0.94356245792573568", "red", "赤", "23.8"], 2635 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 2636 ["0.15491658409260103", "white", "白", "1.65"], 2637 ["0.90964601024271996", "yellow", "黄", "12"], 2638 ["0.96799377498910666", "blue", "青", "12"], 2639 ["0.15043620392537033", "black", "黒", "0.983"]]; 2640 2641 2642 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2643 string[][] combo1ExpectedPermuteCompat = 2644 [["field_a", "field_b", "field_c"], 2645 ["yellow", "黄", "12"], 2646 ["tan", "タン", "8.5"], 2647 ["brown", "褐色", "29.2"], 2648 ["green", "緑", "0.0072"], 2649 ["red", "赤", "23.8"], 2650 ["purple", "紫の", "42"], 2651 ["black", "黒", "0.983"], 2652 ["white", "白", "1.65"], 2653 ["gray", "グレー", "6.2"], 2654 ["blue", "青", "12"], 2655 ["pink", "ピンク", "1.1"], 2656 ["orange", "オレンジ", "2.5"]]; 2657 2658 string[][] combo1ExpectedPermuteCompatProbs = 2659 [["random_value", "field_a", "field_b", "field_c"], 2660 ["0.97088520275428891", "yellow", "黄", "12"], 2661 ["0.96055546286515892", "tan", "タン", "8.5"], 2662 ["0.81756894313730299", "brown", "褐色", "29.2"], 2663 ["0.75710153928957880", "green", "緑", "0.0072"], 2664 ["0.52525980887003243", "red", "赤", "23.8"], 2665 ["0.49287854949943721", "purple", "紫の", "42"], 2666 ["0.47081507067196071", "black", "黒", "0.983"], 2667 ["0.38388182921335101", "white", "白", "1.65"], 2668 ["0.29215990612283349", "gray", "グレー", "6.2"], 2669 ["0.24033216014504433", "blue", "青", "12"], 2670 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2671 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 2672 2673 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. 
*/ 2674 string[][] combo1ExpectedProbsInorder = 2675 [["random_value", "field_a", "field_b", "field_c"], 2676 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2677 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2678 ["0.49287854949943721", "purple", "紫の", "42"], 2679 ["0.96055546286515892", "tan", "タン", "8.5"], 2680 ["0.52525980887003243", "red", "赤", "23.8"], 2681 ["0.75710153928957880", "green", "緑", "0.0072"], 2682 ["0.38388182921335101", "white", "白", "1.65"], 2683 ["0.97088520275428891", "yellow", "黄", "12"], 2684 ["0.24033216014504433", "blue", "青", "12"], 2685 ["0.47081507067196071", "black", "黒", "0.983"], 2686 ["0.81756894313730299", "brown", "褐色", "29.2"], 2687 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2688 2689 string[][] combo1ExpectedBernoulliCompatP50Probs = 2690 [["random_value", "field_a", "field_b", "field_c"], 2691 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2692 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2693 ["0.49287854949943721", "purple", "紫の", "42"], 2694 ["0.38388182921335101", "white", "白", "1.65"], 2695 ["0.24033216014504433", "blue", "青", "12"], 2696 ["0.47081507067196071", "black", "黒", "0.983"], 2697 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2698 2699 string[][] combo1ExpectedBernoulliCompatP40 = 2700 [["field_a", "field_b", "field_c"], 2701 ["orange", "オレンジ", "2.5"], 2702 ["pink", "ピンク", "1.1"], 2703 ["white", "白", "1.65"], 2704 ["blue", "青", "12"], 2705 ["gray", "グレー", "6.2"]]; 2706 2707 string[][] combo1ExpectedDistinctK1P40 = 2708 [["field_a", "field_b", "field_c"], 2709 ["orange", "オレンジ", "2.5"], 2710 ["red", "赤", "23.8"], 2711 ["green", "緑", "0.0072"], 2712 ["blue", "青", "12"], 2713 ["black", "黒", "0.983"]]; 2714 2715 string[][] combo1ExpectedPermuteWt3Probs = 2716 [["random_value", "field_a", "field_b", "field_c"], 2717 ["0.99754077523718754", "yellow", "黄", "12"], 2718 ["0.99527665440088786", "tan", "タン", "8.5"], 2719 ["0.99312578945741659", "brown", "褐色", "29.2"], 2720 ["0.98329602553389361", "purple", 
"紫の", "42"], 2721 ["0.97330961938083660", "red", "赤", "23.8"], 2722 ["0.88797551521739648", "blue", "青", "12"], 2723 ["0.81999230489041786", "gray", "グレー", "6.2"], 2724 ["0.55975569204250941", "white", "白", "1.65"], 2725 ["0.46472135609205739", "black", "黒", "0.983"], 2726 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2727 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2728 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2729 2730 string[][] combo1ExpectedPermuteWt3 = 2731 [["field_a", "field_b", "field_c"], 2732 ["yellow", "黄", "12"], 2733 ["tan", "タン", "8.5"], 2734 ["brown", "褐色", "29.2"], 2735 ["purple", "紫の", "42"], 2736 ["red", "赤", "23.8"], 2737 ["blue", "青", "12"], 2738 ["gray", "グレー", "6.2"], 2739 ["white", "白", "1.65"], 2740 ["black", "黒", "0.983"], 2741 ["pink", "ピンク", "1.1"], 2742 ["orange", "オレンジ", "2.5"], 2743 ["green", "緑", "0.0072"]]; 2744 2745 string[][] combo1ExpectedSampleAlgoRNum4 = 2746 [["field_a", "field_b", "field_c"], 2747 ["blue", "青", "12"], 2748 ["gray", "グレー", "6.2"], 2749 ["brown", "褐色", "29.2"], 2750 ["white", "白", "1.65"]]; 2751 2752 string[][] combo1ExpectedSampleAlgoRNum4Inorder = 2753 [["field_a", "field_b", "field_c"], 2754 ["white", "白", "1.65"], 2755 ["blue", "青", "12"], 2756 ["brown", "褐色", "29.2"], 2757 ["gray", "グレー", "6.2"]]; 2758 2759 string[][] combo1ExpectedReplaceNum10 = 2760 [["field_a", "field_b", "field_c"], 2761 ["gray", "グレー", "6.2"], 2762 ["yellow", "黄", "12"], 2763 ["yellow", "黄", "12"], 2764 ["white", "白", "1.65"], 2765 ["tan", "タン", "8.5"], 2766 ["white", "白", "1.65"], 2767 ["blue", "青", "12"], 2768 ["black", "黒", "0.983"], 2769 ["tan", "タン", "8.5"], 2770 ["purple", "紫の", "42"]]; 2771 2772 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. 
*/ 2773 string[][] data1x200 = 2774 [["field_a"], 2775 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2776 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2777 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2778 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2779 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2780 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2781 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2782 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2783 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2784 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2785 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2786 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2787 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2788 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2789 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2790 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2791 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2792 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2793 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2794 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2795 ]; 2796 2797 string fpath_data1x200 = 
buildPath(testDir, "data1x200.tsv"); 2798 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2799 writeUnittestTsvFile(fpath_data1x200, data1x200); 2800 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]); 2801 2802 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2803 [["field_a"], 2804 ["077"], 2805 ["119"]]; 2806 2807 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2808 [["field_a"], 2809 ["038"], 2810 ["059"], 2811 ["124"], 2812 ["161"], 2813 ["162"], 2814 ["183"]]; 2815 2816 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2817 [["field_a"], 2818 ["025"], 2819 ["039"], 2820 ["082"], 2821 ["107"], 2822 ["108"], 2823 ["122"], 2824 ["136"], 2825 ["166"], 2826 ["182"]]; 2827 2828 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2829 [["field_a"], 2830 ["072"]]; 2831 2832 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2833 [["field_a"], 2834 ["004"], 2835 ["072"]]; 2836 2837 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2838 [["field_a"], 2839 ["004"], 2840 ["072"], 2841 ["181"]]; 2842 2843 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2844 * only expected results. The header is from 3x0, the results are offset 1-position 2845 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2846 */ 2847 string[][] combo2ExpectedBernoulliSkipV333P03 = 2848 [["field_a", "field_b", "field_c"], 2849 ["024"], 2850 ["038"], 2851 ["081"], 2852 ["106"], 2853 ["107"], 2854 ["121"], 2855 ["135"], 2856 ["165"], 2857 ["181"]]; 2858 2859 2860 /* 1x10 - Simple 1-column file. 
*/ 2861 string[][] data1x10 = 2862 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2863 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2864 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2865 writeUnittestTsvFile(fpath_data1x10, data1x10); 2866 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]); 2867 2868 string[][] data1x10ExpectedPermuteCompat = 2869 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2870 2871 string[][] data1x10ExpectedPermuteWt1 = 2872 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2873 2874 /* 2x10a - Uniform distribution [0,1]. */ 2875 string[][] data2x10a = 2876 [["line", "weight"], 2877 ["1", "0.26788837"], 2878 ["2", "0.06601298"], 2879 ["3", "0.38627527"], 2880 ["4", "0.47379424"], 2881 ["5", "0.02966641"], 2882 ["6", "0.05636231"], 2883 ["7", "0.70529242"], 2884 ["8", "0.91836862"], 2885 ["9", "0.99103720"], 2886 ["10", "0.31401740"]]; 2887 2888 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2889 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2890 2891 string[][] data2x10aExpectedPermuteWt2Probs = 2892 [["random_value", "line", "weight"], 2893 ["0.96833865494543658", "8", "0.91836862"], 2894 ["0.91856842054413923", "4", "0.47379424"], 2895 ["0.25730832087795091", "7", "0.70529242"], 2896 ["0.23725317907018120", "9", "0.99103720"], 2897 ["0.16016096701872204", "3", "0.38627527"], 2898 ["0.090819662667243381", "10", "0.31401740"], 2899 ["0.0071764539244361172", "6", "0.05636231"], 2900 ["0.000000048318642951630057", "1", "0.26788837"], 2901 ["0.00000000037525692966535517", "5", "0.02966641"], 2902 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2903 2904 /* 2x10b - Uniform distribution [0,1000]. 
*/ 2905 string[][] data2x10b = 2906 [["line", "weight"], 2907 ["1", "761"], 2908 ["2", "432"], 2909 ["3", "103"], 2910 ["4", "448"], 2911 ["5", "750"], 2912 ["6", "711"], 2913 ["7", "867"], 2914 ["8", "841"], 2915 ["9", "963"], 2916 ["10", "784"]]; 2917 2918 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2919 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2920 2921 string[][] data2x10bExpectedPermuteWt2Probs = 2922 [["random_value", "line", "weight"], 2923 ["0.99996486739067969", "8", "841"], 2924 ["0.99991017467137211", "4", "448"], 2925 ["0.99960871524873662", "6", "711"], 2926 ["0.99914188537143800", "5", "750"], 2927 ["0.99903963250274785", "10", "784"], 2928 ["0.99889631825931946", "7", "867"], 2929 ["0.99852058315191139", "9", "963"], 2930 ["0.99575669679158918", "2", "432"], 2931 ["0.99408758732050595", "1", "761"], 2932 ["0.99315467761212362", "3", "103"]]; 2933 2934 /* 2x10c - Logarithmic distribution in random order. */ 2935 string[][] data2x10c = 2936 [["line", "weight"], 2937 ["1", "31.85"], 2938 ["2", "17403.31"], 2939 ["3", "653.84"], 2940 ["4", "8.23"], 2941 ["5", "2671.04"], 2942 ["6", "26226.08"], 2943 ["7", "1.79"], 2944 ["8", "354.56"], 2945 ["9", "35213.81"], 2946 ["10", "679.29"]]; 2947 2948 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2949 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2950 2951 string[][] data2x10cExpectedPermuteWt2Probs = 2952 [["random_value", "line", "weight"], 2953 ["0.99998939008709697", "6", "26226.08"], 2954 ["0.99995951291695517", "9", "35213.81"], 2955 ["0.99991666907613541", "8", "354.56"], 2956 ["0.99989445052186410", "2", "17403.31"], 2957 ["0.99975897602861630", "5", "2671.04"], 2958 ["0.99891852769877643", "3", "653.84"], 2959 ["0.99889167752782515", "10", "679.29"], 2960 ["0.99512207506850148", "4", "8.23"], 2961 ["0.86789371584259023", "1", "31.85"], 2962 ["0.58574438162915610", "7", "1.79"]]; 2963 2964 /* 2x10d. Logarithmic distribution in ascending order. 
*/ 2965 string[][] data2x10d = 2966 [["line", "weight"], 2967 ["1", "1.79"], 2968 ["2", "8.23"], 2969 ["3", "31.85"], 2970 ["4", "354.56"], 2971 ["5", "653.84"], 2972 ["6", "679.29"], 2973 ["7", "2671.04"], 2974 ["8", "17403.31"], 2975 ["9", "26226.08"], 2976 ["10", "35213.81"]]; 2977 2978 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 2979 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 2980 2981 string[][] data2x10dExpectedPermuteWt2Probs = 2982 [["random_value", "line", "weight"], 2983 ["0.99999830221846353", "8", "17403.31"], 2984 ["0.99997860834041397", "10", "35213.81"], 2985 ["0.99994563828986716", "9", "26226.08"], 2986 ["0.99988650363575737", "4", "354.56"], 2987 ["0.99964161939190088", "7", "2671.04"], 2988 ["0.99959045338948649", "6", "679.29"], 2989 ["0.99901574490639788", "5", "653.84"], 2990 ["0.97803163304747431", "3", "31.85"], 2991 ["0.79994791806910948", "2", "8.23"], 2992 ["0.080374261239949119", "1", "1.79"]]; 2993 2994 /* 2x10e. Logarithmic distribution in descending order. 
*/ 2995 string[][] data2x10e = 2996 [["line", "weight"], 2997 ["1", "35213.81"], 2998 ["2", "26226.08"], 2999 ["3", "17403.31"], 3000 ["4", "2671.04"], 3001 ["5", "679.29"], 3002 ["6", "653.84"], 3003 ["7", "354.56"], 3004 ["8", "31.85"], 3005 ["9", "8.23"], 3006 ["10", "1.79"]]; 3007 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 3008 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 3009 3010 string[][] data2x10eExpectedPermuteWt2Probs = 3011 [["random_value", "line", "weight"], 3012 ["0.99998493348975237", "4", "2671.04"], 3013 ["0.99995934807202624", "3", "17403.31"], 3014 ["0.99992995739727453", "2", "26226.08"], 3015 ["0.99987185679245649", "1", "35213.81"], 3016 ["0.99957451563173938", "6", "653.84"], 3017 ["0.99907273650209583", "8", "31.85"], 3018 ["0.99905260312968946", "5", "679.29"], 3019 ["0.99730333650516401", "7", "354.56"], 3020 ["0.84093902435227808", "9", "8.23"], 3021 ["0.65650015926290028", "10", "1.79"]]; 3022 3023 /* Data sets for distinct sampling. */ 3024 string[][] data5x25 = 3025 [["ID", "Shape", "Color", "Size", "Weight"], 3026 ["01", "circle", "red", "S", "10"], 3027 ["02", "circle", "black", "L", "20"], 3028 ["03", "square", "black", "L", "20"], 3029 ["04", "circle", "green", "L", "30"], 3030 ["05", "ellipse", "red", "S", "20"], 3031 ["06", "triangle", "red", "S", "10"], 3032 ["07", "triangle", "red", "L", "20"], 3033 ["08", "square", "black", "S", "10"], 3034 ["09", "circle", "black", "S", "20"], 3035 ["10", "square", "green", "L", "20"], 3036 ["11", "triangle", "red", "L", "20"], 3037 ["12", "circle", "green", "L", "30"], 3038 ["13", "ellipse", "red", "S", "20"], 3039 ["14", "circle", "green", "L", "30"], 3040 ["15", "ellipse", "red", "L", "30"], 3041 ["16", "square", "red", "S", "10"], 3042 ["17", "circle", "black", "L", "20"], 3043 ["18", "square", "red", "S", "20"], 3044 ["19", "square", "black", "L", "20"], 3045 ["20", "circle", "red", "S", "10"], 3046 ["21", "ellipse", "black", "L", "30"], 3047 ["22", "triangle", 
"red", "L", "30"], 3048 ["23", "circle", "green", "S", "20"], 3049 ["24", "square", "green", "L", "20"], 3050 ["25", "circle", "red", "S", "10"], 3051 ]; 3052 3053 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 3054 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 3055 writeUnittestTsvFile(fpath_data5x25, data5x25); 3056 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]); 3057 3058 string[][] data5x25ExpectedDistinctK2P40 = 3059 [["ID", "Shape", "Color", "Size", "Weight"], 3060 ["03", "square", "black", "L", "20"], 3061 ["05", "ellipse", "red", "S", "20"], 3062 ["08", "square", "black", "S", "10"], 3063 ["10", "square", "green", "L", "20"], 3064 ["13", "ellipse", "red", "S", "20"], 3065 ["15", "ellipse", "red", "L", "30"], 3066 ["16", "square", "red", "S", "10"], 3067 ["18", "square", "red", "S", "20"], 3068 ["19", "square", "black", "L", "20"], 3069 ["21", "ellipse", "black", "L", "30"], 3070 ["24", "square", "green", "L", "20"], 3071 ]; 3072 3073 string[][] data5x25ExpectedDistinctK2K4P20 = 3074 [["ID", "Shape", "Color", "Size", "Weight"], 3075 ["03", "square", "black", "L", "20"], 3076 ["07", "triangle", "red", "L", "20"], 3077 ["08", "square", "black", "S", "10"], 3078 ["10", "square", "green", "L", "20"], 3079 ["11", "triangle", "red", "L", "20"], 3080 ["16", "square", "red", "S", "10"], 3081 ["18", "square", "red", "S", "20"], 3082 ["19", "square", "black", "L", "20"], 3083 ["22", "triangle", "red", "L", "30"], 3084 ["24", "square", "green", "L", "20"], 3085 ]; 3086 3087 string[][] data5x25ExpectedDistinctK2K3K4P20 = 3088 [["ID", "Shape", "Color", "Size", "Weight"], 3089 ["04", "circle", "green", "L", "30"], 3090 ["07", "triangle", "red", "L", "20"], 3091 ["09", "circle", "black", "S", "20"], 3092 ["11", "triangle", "red", "L", "20"], 3093 ["12", "circle", "green", "L", "30"], 3094 ["14", "circle", "green", "L", "30"], 3095 ["16", "square", "red", "S", "10"], 3096 ["18", "square", "red", "S", "20"], 3097 
["22", "triangle", "red", "L", "30"], 3098 ]; 3099 3100 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 3101 string[][] data2x25 = 3102 [["Shape", "Size"], 3103 ["circle", "S"], 3104 ["circle", "L"], 3105 ["square", "L"], 3106 ["circle", "L"], 3107 ["ellipse", "S"], 3108 ["triangle", "S"], 3109 ["triangle", "L"], 3110 ["square", "S"], 3111 ["circle", "S"], 3112 ["square", "L"], 3113 ["triangle", "L"], 3114 ["circle", "L"], 3115 ["ellipse", "S"], 3116 ["circle", "L"], 3117 ["ellipse", "L"], 3118 ["square", "S"], 3119 ["circle", "L"], 3120 ["square", "S"], 3121 ["square", "L"], 3122 ["circle", "S"], 3123 ["ellipse", "L"], 3124 ["triangle", "L"], 3125 ["circle", "S"], 3126 ["square", "L"], 3127 ["circle", "S"], 3128 ]; 3129 3130 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 3131 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 3132 writeUnittestTsvFile(fpath_data2x25, data2x25); 3133 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. 
$]); 3134 3135 string[][] data2x25ExpectedDistinctK1K2P20 = 3136 [["Shape", "Size"], 3137 ["square", "L"], 3138 ["triangle", "L"], 3139 ["square", "S"], 3140 ["square", "L"], 3141 ["triangle", "L"], 3142 ["square", "S"], 3143 ["square", "S"], 3144 ["square", "L"], 3145 ["triangle", "L"], 3146 ["square", "L"], 3147 ]; 3148 3149 string[][] data1x25 = 3150 [["Shape-Size"], 3151 ["circle-S"], 3152 ["circle-L"], 3153 ["square-L"], 3154 ["circle-L"], 3155 ["ellipse-S"], 3156 ["triangle-S"], 3157 ["triangle-L"], 3158 ["square-S"], 3159 ["circle-S"], 3160 ["square-L"], 3161 ["triangle-L"], 3162 ["circle-L"], 3163 ["ellipse-S"], 3164 ["circle-L"], 3165 ["ellipse-L"], 3166 ["square-S"], 3167 ["circle-L"], 3168 ["square-S"], 3169 ["square-L"], 3170 ["circle-S"], 3171 ["ellipse-L"], 3172 ["triangle-L"], 3173 ["circle-S"], 3174 ["square-L"], 3175 ["circle-S"], 3176 ]; 3177 3178 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 3179 string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 3180 writeUnittestTsvFile(fpath_data1x25, data1x25); 3181 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. 
$]); 3182 3183 string[][] data1x25ExpectedDistinctK1P20 = 3184 [["Shape-Size"], 3185 ["triangle-L"], 3186 ["square-S"], 3187 ["triangle-L"], 3188 ["ellipse-L"], 3189 ["square-S"], 3190 ["square-S"], 3191 ["ellipse-L"], 3192 ["triangle-L"], 3193 ]; 3194 3195 string[][] data1x25ExpectedDistinctK1P20Probs = 3196 [["random_value", "Shape-Size"], 3197 ["0", "triangle-L"], 3198 ["0", "square-S"], 3199 ["0", "triangle-L"], 3200 ["0", "ellipse-L"], 3201 ["0", "square-S"], 3202 ["0", "square-S"], 3203 ["0", "ellipse-L"], 3204 ["0", "triangle-L"], 3205 ]; 3206 3207 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 3208 [["random_value", "Shape-Size"], 3209 ["1", "circle-S"], 3210 ["4", "circle-L"], 3211 ["2", "square-L"], 3212 ["4", "circle-L"], 3213 ["2", "ellipse-S"], 3214 ["1", "triangle-S"], 3215 ["0", "triangle-L"], 3216 ["0", "square-S"], 3217 ["1", "circle-S"], 3218 ["2", "square-L"], 3219 ["0", "triangle-L"], 3220 ["4", "circle-L"], 3221 ["2", "ellipse-S"], 3222 ["4", "circle-L"], 3223 ["0", "ellipse-L"], 3224 ["0", "square-S"], 3225 ["4", "circle-L"], 3226 ["0", "square-S"], 3227 ["2", "square-L"], 3228 ["1", "circle-S"], 3229 ["0", "ellipse-L"], 3230 ["0", "triangle-L"], 3231 ["1", "circle-S"], 3232 ["2", "square-L"], 3233 ["1", "circle-S"], 3234 ]; 3235 3236 /* 3237 * Enough setup! Actually run some tests! 3238 */ 3239 3240 /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. 
*/ 3241 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 3242 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 3243 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 3244 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 3245 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 3246 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 3247 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3248 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3249 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3250 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3251 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3252 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3253 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3254 3255 /* Shuffling, without compatibility mode, or with both compatibility and printing. 
*/ 3256 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 3257 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 3258 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 3259 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 3260 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 3261 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 3262 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3263 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3264 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3265 3266 /* Reservoir sampling using Algorithm R. 3267 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 
3268 */ 3269 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3270 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3271 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 3272 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 3273 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 3274 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 3275 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3276 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3277 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5); 3278 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4); 3279 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3); 3280 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2); 3281 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); 3282 3283 /* Inorder versions of Algorithm R tests. 
*/ 3284 testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3285 testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3286 testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3287 testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3288 testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3289 testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3290 testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3291 testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3292 testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder); 3293 testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder); 3294 testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder); 3295 testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder); 3296 testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder); 3297 3298 /* Bernoulli sampling cases. 
*/ 3299 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 3300 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 3301 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 3302 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 3303 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 3304 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3305 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 3306 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 3307 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 3308 3309 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. 
*/ 3310 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 3311 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 3312 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 3313 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 3314 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 3315 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 3316 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 3317 3318 /* Distinct sampling cases. */ 3319 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 3320 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 3321 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 3322 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 3323 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3324 3325 3326 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 3327 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 
3328 */ 3329 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3330 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3331 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3332 data3x6ExpectedWt3ProbsInorder); 3333 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3334 data3x6ExpectedWt3V41ProbsInorder); 3335 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 3336 data3x6ExpectedDistinctK1K3P60Probs); 3337 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 3338 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 3339 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 3340 data3x6ExpectedDistinctK2P2ProbsInorder); 3341 3342 /* Simple random sampling with replacement. */ 3343 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3344 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 3345 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 3346 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 3347 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 3348 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 3349 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 3350 3351 /* Shuffling, compatibility mode, without headers. 
*/ 3352 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]); 3353 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]); 3354 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]); 3355 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]); 3356 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]); 3357 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3358 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3359 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3360 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]); 3361 3362 /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */ 3363 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]); 3364 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]); 3365 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]); 3366 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]); 3367 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3368 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. 
$]); 3369 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3370 3371 /* Reservoir sampling using Algorithm R, no headers. */ 3372 testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3373 testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3374 testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]); 3375 testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3376 testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3377 testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3378 testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]); 3379 testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]); 3380 testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]); 3381 testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]); 3382 testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); 3383 3384 /* Reservoir sampling using Algorithm R, no headers, inorder output. 
*/ 3385 testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3386 testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3387 testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3388 testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3389 testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3390 testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3391 testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]); 3392 testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3393 testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]); 3394 testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]); 3395 testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); 3396 3397 /* Bernoulli sampling cases. */ 3398 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]); 3399 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3400 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. 
$]);
testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]);
testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]);

/* Bernoulli sampling with probabilities in skip sampling range.
 * NOTE(review): These were previously named test-bb1 - test-bb7, duplicating the
 * IDs of the no-header shuffling tests earlier in this unittest. IDs are renamed
 * test-bc1 - test-bc7 so a failure message identifies a unique test case. */
testTsvSample(["test-bc1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]);
testTsvSample(["test-bc2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]);
testTsvSample(["test-bc3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]);
testTsvSample(["test-bc4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]);
testTsvSample(["test-bc5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]);
testTsvSample(["test-bc6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]);
testTsvSample(["test-bc7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]);

/* Distinct sampling cases. */
testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]);
testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);
testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 ..
$]);
testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]);

/* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */
testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]);
testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]);
testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader],
              data3x6ExpectedDistinctK1K3P60Probs[1 .. $]);
/* NOTE(review): renamed from a duplicate 'test-b24'; unique IDs keep failure
 * messages unambiguous. */
testTsvSample(["test-b24b", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader],
              data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]);

/* Simple random sampling with replacement. */
testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty);
testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]);
testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]);
testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]);

/* Multi-file tests.
*/ 3436 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 3437 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3438 combo1ExpectedPermuteCompat); 3439 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 3440 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3441 combo1ExpectedPermuteCompatProbs); 3442 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 3443 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3444 combo1ExpectedPermuteWt3Probs); 3445 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3446 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3447 combo1ExpectedPermuteWt3); 3448 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3449 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3450 combo1ExpectedSampleAlgoRNum4); 3451 testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3452 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3453 combo1ExpectedSampleAlgoRNum4Inorder); 3454 3455 /* Multi-file, no headers. */ 3456 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 3457 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3458 fpath_data3x6_noheader, fpath_data3x2_noheader], 3459 combo1ExpectedPermuteCompat[1 .. $]); 3460 testTsvSample(["test-c7", "--static-seed", "--print-random", 3461 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3462 fpath_data3x6_noheader, fpath_data3x2_noheader], 3463 combo1ExpectedPermuteCompatProbs[1 .. 
$]); 3464 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 3465 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3466 fpath_data3x6_noheader, fpath_data3x2_noheader], 3467 combo1ExpectedPermuteWt3Probs[1 .. $]); 3468 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3469 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3470 fpath_data3x6_noheader, fpath_data3x2_noheader], 3471 combo1ExpectedPermuteWt3[1 .. $]); 3472 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3473 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3474 fpath_data3x6_noheader, fpath_data3x2_noheader], 3475 combo1ExpectedSampleAlgoRNum4[1 .. $]); 3476 testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3477 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3478 fpath_data3x6_noheader, fpath_data3x2_noheader], 3479 combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3480 3481 /* Bernoulli sampling cases. */ 3482 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 3483 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3484 combo1ExpectedBernoulliCompatP50Probs); 3485 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 3486 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3487 combo1ExpectedBernoulliCompatP40); 3488 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 3489 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3490 fpath_data3x6_noheader, fpath_data3x2_noheader], 3491 combo1ExpectedBernoulliCompatP50Probs[1 .. 
$]);
testTsvSample(["test-c14", "--static-seed", "--prob", ".4",
               fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
               fpath_data3x6_noheader, fpath_data3x2_noheader],
              combo1ExpectedBernoulliCompatP40[1 .. $]);

/* Bernoulli sampling with probabilities in skip sampling range. */
testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03",
               fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10],
              combo2ExpectedBernoulliSkipV333P03);
/* NOTE(review): renamed from a duplicate 'test-cc1'; unique IDs keep failure
 * messages unambiguous. */
testTsvSample(["test-cc2", "-v", "333", "-p", "0.03",
               fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader],
              combo2ExpectedBernoulliSkipV333P03[1 .. $]);

/* Distinct sampling cases.
 * NOTE(review): IDs test-c13/test-c14 were already used by the Bernoulli
 * multi-file tests above, so these use test-c13b/test-c14b. */
testTsvSample(["test-c13b", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4",
               fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
              combo1ExpectedDistinctK1P40);
testTsvSample(["test-c14b", "--static-seed", "--key-fields", "1", "--prob", ".4",
               fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty,
               fpath_data3x6_noheader, fpath_data3x2_noheader],
              combo1ExpectedDistinctK1P40[1 .. $]);

/* Generating random weights. */
testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder",
               fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2],
              combo1ExpectedProbsInorder);
testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder",
               fpath_data3x3_noheader, fpath_data3x1_noheader,
               fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader],
              combo1ExpectedProbsInorder[1 .. $]);

/* Simple random sampling with replacement.
*/ 3524 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 3525 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3526 combo1ExpectedReplaceNum10); 3527 3528 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 3529 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3530 fpath_data3x6_noheader, fpath_data3x2_noheader], 3531 combo1ExpectedReplaceNum10[1 .. $]); 3532 3533 /* Single column file. */ 3534 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3535 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3536 3537 /* Distributions. */ 3538 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 3539 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 3540 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 3541 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 3542 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 3543 3544 /* Tests of subset sample (--n|num) field. Random sampling, Bernoulli sampling, distinct sampling. 3545 * 3546 * Note: The way these tests are done ensures that subset length does not affect 3547 * output order. 3548 */ 3549 import std.algorithm : min; 3550 for (size_t n = data3x6.length + 2; n >= 1; n--) 3551 { 3552 /* reservoirSamplingViaHeap. 
3553 */ 3554 size_t expectedLength = min(data3x6.length, n + 1); 3555 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 3556 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3557 3558 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 3559 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3560 3561 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 3562 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 3563 3564 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 3565 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 3566 3567 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 3568 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 3569 3570 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 3571 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 3572 3573 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 3574 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 3575 3576 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 3577 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 3578 3579 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 3580 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 3581 3582 /* Bernoulli sampling. 
*/
/* NOTE(review): a redundant 'import std.algorithm : min;' was removed here;
 * 'min' is already imported in this scope immediately before the enclosing loop. */
size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length);

testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
               "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]);

testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
               "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]);

testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
               "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]);

testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string,
               fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]);

/* Distinct Sampling. */
size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length);

testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
               "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]);

testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string,
               fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]);

testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
               "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]);

testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string,
               fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]);
}

/* Similar tests with the 1x10 data set.
*/ 3617 for (size_t n = data1x10.length + 2; n >= 1; n--) 3618 { 3619 size_t expectedLength = min(data1x10.length, n + 1); 3620 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 3621 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 3622 3623 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 3624 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 3625 3626 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 3627 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 3628 3629 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 3630 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 3631 } 3632 3633 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 3634 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 3635 { 3636 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 3637 data3x6ExpectedReplaceNum10[0 .. n + 1]); 3638 3639 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 3640 data3x6ExpectedReplaceNum10[1 .. n + 1]); 3641 } 3642 3643 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. 
*/ 3644 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 3645 { 3646 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 3647 3648 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3649 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 3650 3651 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3652 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 3653 } 3654 3655 /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */ 3656 testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3657 testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3658 testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3659 testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3660 testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3661 testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3662 testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3663 testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3664 testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder); 3665 testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], 
data3x6ExpectedSampleCompatNum4Inorder); 3666 testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder); 3667 testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder); 3668 testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder); 3669 3670 testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3671 testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3672 testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3673 testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3674 testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3675 testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3676 testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]); 3677 testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]); 3678 testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]); 3679 testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. 
$]);
testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]);

/* Inorder sampling tests with random number printing. --compatibility-mode not needed.
 * NOTE(review): the no-compatibility-mode variants previously reused the IDs
 * test-at19/test-at20; they now use 'b'-suffixed IDs so every test name is unique. */
testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder);
testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder);
testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
testTsvSample(["test-at19b", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder);
testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
testTsvSample(["test-at20b", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder);
testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder);
testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder);

testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 ..
$]);
testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]);
testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]);
testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
/* NOTE(review): renamed from a duplicate 'test-au19'; the no-compatibility-mode
 * variant gets a 'b' suffix so every test name is unique. */
testTsvSample(["test-au19b", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]);
testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]);
testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]);
testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]);

/* Inorder weighted sampling tests.
*/ 3703 testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3704 testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3705 testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder); 3706 testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder); 3707 testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder); 3708 testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder); 3709 testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder); 3710 3711 testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3712 testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3713 testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]); 3714 testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]); 3715 testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]); 3716 testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]); 3717 testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]); 3718 3719 /* 3720 * Distinct sampling tests. 
3721 */ 3722 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 3723 data5x25ExpectedDistinctK2P40); 3724 3725 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 3726 data5x25ExpectedDistinctK2K4P20); 3727 3728 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 3729 data5x25ExpectedDistinctK2K3K4P20); 3730 3731 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 3732 data5x25ExpectedDistinctK2P40[1 .. $]); 3733 3734 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 3735 data5x25ExpectedDistinctK2K4P20[1 .. $]); 3736 3737 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 3738 data5x25ExpectedDistinctK2K3K4P20[1 .. $]); 3739 3740 3741 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 3742 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 3743 * in data2x25 are the same keys as '-k 2,4' in data5x25. 3744 */ 3745 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 3746 data2x25ExpectedDistinctK1K2P20); 3747 3748 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 3749 data2x25ExpectedDistinctK1K2P20); 3750 3751 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 3752 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3753 3754 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 3755 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3756 3757 /* Similar to the last set, but for a 1-column file. Also with random value printing. 
*/ 3758 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 3759 data1x25ExpectedDistinctK1P20); 3760 3761 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 3762 data1x25ExpectedDistinctK1P20); 3763 3764 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 3765 data1x25ExpectedDistinctK1P20[1 .. $]); 3766 3767 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 3768 data1x25ExpectedDistinctK1P20[1 .. $]); 3769 3770 3771 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 3772 data1x25ExpectedDistinctK1P20Probs); 3773 3774 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 3775 data1x25ExpectedDistinctK1P20Probs); 3776 3777 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 3778 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3779 3780 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 3781 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3782 3783 3784 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 3785 data1x25ExpectedDistinctK1P20ProbsInorder); 3786 3787 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 3788 data1x25ExpectedDistinctK1P20ProbsInorder); 3789 3790 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 3791 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3792 3793 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 3794 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3795 3796 }