1 /** 2 Command line tool for shuffling or sampling lines from input streams. Several methods 3 are available, including weighted and unweighted shuffling, simple and weighted random 4 sampling, sampling with replacement, Bernoulli sampling, and distinct sampling. 5 6 Copyright (c) 2017-2020, eBay Inc. 7 Initially written by Jon Degenhardt 8 9 License: Boost License 1.0 (http://boost.org/LICENSE_1_0.txt) 10 */ 11 module tsv_utils.tsv_sample; 12 13 import std.array : appender, Appender, RefAppender; 14 import std.exception : enforce; 15 import std.format : format; 16 import std.range; 17 import std.stdio; 18 import std.typecons : tuple, Flag; 19 20 static if (__VERSION__ >= 2085) extern(C) __gshared string[] rt_options = [ "gcopt=cleanup:none" ]; 21 22 version(unittest) 23 { 24 // When running unit tests, use main from -main compiler switch. 25 } 26 else 27 { 28 /** Main program. 29 * 30 * Invokes command line argument processing and calls tsvSample to do the real 31 * work. Errors occurring during processing are caught and reported to the user. 32 */ 33 int main(string[] cmdArgs) 34 { 35 /* When running in DMD code coverage mode, turn on report merging. */ 36 version(D_Coverage) version(DigitalMars) 37 { 38 import core.runtime : dmd_coverSetMerge; 39 dmd_coverSetMerge(true); 40 } 41 42 TsvSampleOptions cmdopt; 43 const r = cmdopt.processArgs(cmdArgs); 44 if (!r[0]) return r[1]; 45 version(LDC_Profile) 46 { 47 import ldc.profile : resetAll; 48 resetAll(); 49 } 50 try 51 { 52 import tsv_utils.common.utils : BufferedOutputRange; 53 auto bufferedOutput = BufferedOutputRange!(typeof(stdout))(stdout); 54 55 tsvSample(cmdopt, bufferedOutput); 56 } 57 catch (Exception exc) 58 { 59 stderr.writefln("Error [%s]: %s", cmdopt.programName, exc.msg); 60 return 1; 61 } 62 return 0; 63 } 64 } 65 66 immutable helpText = q"EOS 67 Synopsis: tsv-sample [options] [file...] 68 69 Sample input lines or randomize their order. Several modes of operation 70 are available: 71 * Shuffling (the default): All input lines are output in random order. All 72 orderings are equally likely. 73 * Random sampling (--n|num N): A random sample of N lines are selected and 74 written to standard output. By default, selected lines are written in 75 random order. All sample sets and orderings are equally likely. Use 76 --i|inorder to write the selected lines in the original input order. 77 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted 78 sample of N lines is produced. Weights are taken from field F. Lines are 79 output in weighted selection order. Use --i|inorder to write in original 80 input order. Omit --n|num to shuffle all lines (weighted shuffling). 81 * Sampling with replacement (--r|replace, --n|num N): All input lines are 82 read in, then lines are repeatedly selected at random and written out. 83 This continues until N lines are output. Individual lines can be written 84 multiple times. Output continues forever if N is zero or not provided. 85 * Bernoulli sampling (--p|prob P): A random subset of lines is selected 86 based on probability P, a 0.0-1.0 value. This is a streaming operation. 87 A decision is made on each line as it is read. Line order is not changed. 88 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled 89 based on the values in the key fields. A subset of keys are chosen based 90 on the inclusion probability (a 'distinct' set of keys). All lines with 91 one of the selected keys are output. Line order is not changed. 92 93 Use '--help-verbose' for detailed information. 94 95 Options: 96 EOS"; 97 98 immutable helpTextVerbose = q"EOS 99 Synopsis: tsv-sample [options] [file...] 100 101 Sample input lines or randomize their order. Several modes of operation 102 are available: 103 * Shuffling (the default): All input lines are output in random order. All 104 orderings are equally likely. 105 * Random sampling (--n|num N): A random sample of N lines are selected and 106 written to standard output. By default, selected lines are written in 107 random order. All sample sets and orderings are equally likely. Use 108 --i|inorder to write the selected lines in the original input order. 109 * Weighted random sampling (--n|num N, --w|weight-field F): A weighted 110 sample of N lines is produced. Weights are taken from field F. Lines are 111 output in weighted selection order. Use --i|inorder to write in original 112 input order. Omit --n|num to shuffle all lines (weighted shuffling). 113 * Sampling with replacement (--r|replace, --n|num N): All input lines are 114 read in, then lines are repeatedly selected at random and written out. 115 This continues until N lines are output. Individual lines can be written 116 multiple times. Output continues forever if N is zero or not provided. 117 * Bernoulli sampling (--p|prob P): A random subset of lines is selected 118 based on probability P, a 0.0-1.0 value. This is a streaming operation. 119 A decision is made on each line as it is read. Line order is not changed. 120 * Distinct sampling (--k|key-fields F, --p|prob P): Input lines are sampled 121 based on the values in the key fields. A subset of keys are chosen based 122 on the inclusion probability (a 'distinct' set of keys). All lines with 123 one of the selected keys are output. Line order is not changed. 124 125 Sample size: The '--n|num' option controls the sample size for all 126 sampling methods. In the case of simple and weighted random sampling it 127 also limits the amount of memory required. 128 129 Controlling the random seed: By default, each run produces a different 130 randomization or sampling. Using '--s|static-seed' changes this so 131 multiple runs produce the same results. This works by using the same 132 random seed each run. The random seed can be specified using 133 '--v|seed-value'. This takes a non-zero, 32-bit positive integer. (A zero 134 value is a no-op and ignored.) 135 136 Memory use: Bernoulli sampling and distinct sampling make decisions on 137 each line as it is read, there is no memory accumulation. These algorithms 138 can run on arbitrary size inputs. Sampling with replacement reads all 139 lines into memory and is limited by available memory. Shuffling also reads 140 all lines into memory and is similarly limited. Random sampling uses 141 reservoir sampling, and only needs to hold the sample size (--n|num) in 142 memory. The input data can be of any length. 143 144 Weighted sampling: Weighted random sampling is done using an algorithm 145 described by Pavlos Efraimidis and Paul Spirakis. Weights should be 146 positive values representing the relative weight of the entry in the 147 collection. Counts and similar can be used as weights, it is *not* 148 necessary to normalize to a [0,1] interval. Negative values are not 149 meaningful and given the value zero. Input order is not retained, instead 150 lines are output ordered by the randomized weight that was assigned. This 151 means that a smaller valid sample can be produced by taking the first N 152 lines of output. For more info on the sampling approach see: 153 * Wikipedia: https://en.wikipedia.org/wiki/Reservoir_sampling 154 * "Weighted Random Sampling over Data Streams", Pavlos S. Efraimidis 155 (https://arxiv.org/abs/1012.0256) 156 157 Printing random values: Most of the sampling algorithms work by generating 158 a random value for each line. (See "Compatibility mode" below.) The nature 159 of these values depends on the sampling algorithm. They are used for both 160 line selection and output ordering. The '--p|print-random' option can be 161 used to print these values. The random value is prepended to the line 162 separated by the --d|delimiter char (TAB by default). The 163 '--gen-random-inorder' option takes this one step further, generating 164 random values for all input lines without changing the input order. The 165 types of values currently used by these sampling algorithms: 166 * Unweighted sampling: Uniform random value in the interval [0,1]. This 167 includes Bernoulli sampling and unweighted line order randomization. 168 * Weighted sampling: Value in the interval [0,1]. Distribution depends on 169 the values in the weight field. It is used as a partial ordering. 170 * Distinct sampling: An integer, zero and up, representing a selection 171 group. The inclusion probability determines the number of selection groups. 172 * Sampling with replacement: Random value printing is not supported. 173 174 The specifics behind these random values are subject to change in future 175 releases. 176 177 Compatibility mode: As described above, many of the sampling algorithms 178 assign a random value to each line. This is useful when printing random 179 values. It has another occasionally useful property: repeated runs with 180 the same static seed but different selection parameters are more 181 compatible with each other, as each line gets assigned the same random 182 value on every run. For example, if Bernoulli sampling is run with 183 '--prob 0.2 --static-seed', then run again with '--prob 0.3 --static-seed', 184 all the lines selected in the first run will be selected in the second. 185 This comes at a cost: in some cases there are faster algorithms that don't 186 preserve this property. By default, tsv-sample will use faster algorithms 187 when available. However, the '--compatibility-mode' option switches to 188 algorithms that assign a random value per line. Printing random values 189 also engages compatibility mode. 190 191 Options: 192 EOS"; 193 194 /** Container for command line options and derived data. 195 * 196 * TsvSampleOptions handles several aspects of command line options. On the input side, 197 * it defines the command line options available, performs validation, and sets up any 198 * derived state based on the options provided. These activities are handled by the 199 * processArgs() member. 200 * 201 * Once argument processing is complete, TsvSampleOptions is used as a container 202 * holding the specific processing options used by the different sampling routines. 203 */ 204 struct TsvSampleOptions 205 { 206 import tsv_utils.common.utils : InputSourceRange; 207 208 string programName; /// Program name 209 InputSourceRange inputSources; /// Input files 210 bool helpVerbose = false; /// --help-verbose 211 bool hasHeader = false; /// --H|header 212 ulong sampleSize = 0; /// --n|num - Size of the desired sample 213 double inclusionProbability = double.nan; /// --p|prob - Inclusion probability 214 size_t[] keyFields; /// --k|key-fields - Used with inclusion probability 215 size_t weightField = 0; /// --w|weight-field - Field holding the weight 216 bool srsWithReplacement = false; /// --r|replace 217 bool preserveInputOrder = false; /// --i|inorder 218 bool staticSeed = false; /// --s|static-seed 219 uint seedValueOptionArg = 0; /// --v|seed-value 220 bool printRandom = false; /// --print-random 221 bool genRandomInorder = false; /// --gen-random-inorder 222 string randomValueHeader = "random_value"; /// --random-value-header 223 bool compatibilityMode = false; /// --compatibility-mode 224 char delim = '\t'; /// --d|delimiter 225 bool versionWanted = false; /// --V|version 226 bool preferSkipSampling = false; /// --prefer-skip-sampling 227 bool preferAlgorithmR = false; /// --prefer-algorithm-r 228 bool hasWeightField = false; /// Derived. 229 bool useBernoulliSampling = false; /// Derived. 230 bool useDistinctSampling = false; /// Derived. 231 bool distinctKeyIsFullLine = false; /// Derived. True if '--k|key-fields 0' is specfied. 232 bool usingUnpredictableSeed = true; /// Derived from --static-seed, --seed-value 233 uint seed = 0; /// Derived from --static-seed, --seed-value 234 235 /** Process tsv-sample command line arguments. 236 * 237 * Defines the command line options, performs validation, and derives additional 238 * state. std.getopt.getopt is called to do the main option processing followed 239 * additional validation and derivation. 240 * 241 * Help text is printed to standard output if help was requested. Error text is 242 * written to stderr if invalid input is encountered. 243 * 244 * A tuple is returned. First value is true if command line arguments were 245 * successfully processed and execution should continue, or false if an error 246 * occurred or the user asked for help. If false, the second value is the 247 * appropriate exit code (0 or 1). 248 * 249 * Returning true (execution continues) means args have been validated and derived 250 * values calculated. Field indices will have been converted to zero-based. 251 */ 252 auto processArgs(ref string[] cmdArgs) 253 { 254 import std.algorithm : all, canFind, each; 255 import std.getopt; 256 import std.math : isNaN; 257 import std.path : baseName, stripExtension; 258 import std.typecons : Yes, No; 259 import tsv_utils.common.utils : inputSourceRange, makeFieldListOptionHandler, ReadHeader; 260 261 programName = (cmdArgs.length > 0) ? cmdArgs[0].stripExtension.baseName : "Unknown_program_name"; 262 263 try 264 { 265 arraySep = ","; // Use comma to separate values in command line options 266 auto r = getopt( 267 cmdArgs, 268 "help-verbose", " Print more detailed help.", &helpVerbose, 269 270 std.getopt.config.caseSensitive, 271 "H|header", " Treat the first line of each file as a header.", &hasHeader, 272 std.getopt.config.caseInsensitive, 273 274 "n|num", "NUM Maximum number of lines to output. All selected lines are output if not provided or zero.", &sampleSize, 275 "p|prob", "NUM Inclusion probability (0.0 < NUM <= 1.0). For Bernoulli sampling, the probability each line is selected output. For distinct sampling, the probability each unique key is selected for output.", &inclusionProbability, 276 277 "k|key-fields", "<field-list> Fields to use as key for distinct sampling. Use with '--p|prob'. Specify '--k|key-fields 0' to use the entire line as the key.", 278 keyFields.makeFieldListOptionHandler!(size_t, No.convertToZeroBasedIndex, Yes.allowFieldNumZero), 279 280 "w|weight-field", "NUM Field containing weights. All lines get equal weight if not provided or zero.", &weightField, 281 "r|replace", " Simple random sampling with replacement. Use --n|num to specify the sample size.", &srsWithReplacement, 282 "i|inorder", " Output random samples in original input order. Requires use of --n|num.", &preserveInputOrder, 283 "s|static-seed", " Use the same random seed every run.", &staticSeed, 284 285 std.getopt.config.caseSensitive, 286 "v|seed-value", "NUM Sets the random seed. Use a non-zero, 32 bit positive integer. Zero is a no-op.", &seedValueOptionArg, 287 std.getopt.config.caseInsensitive, 288 289 "print-random", " Include the assigned random value (prepended) when writing output lines.", &printRandom, 290 "gen-random-inorder", " Output all lines with assigned random values prepended, no changes to the order of input.", &genRandomInorder, 291 "random-value-header", " Header to use with --print-random and --gen-random-inorder. Default: 'random_value'.", &randomValueHeader, 292 "compatibility-mode", " Turns on 'compatibility-mode'. Use --help-verbose for information.", &compatibilityMode, 293 294 "d|delimiter", "CHR Field delimiter.", &delim, 295 296 std.getopt.config.caseSensitive, 297 "V|version", " Print version information and exit.", &versionWanted, 298 std.getopt.config.caseInsensitive, 299 300 "prefer-skip-sampling", " (Internal) Prefer the skip-sampling algorithm for Bernoulli sampling. Used for testing and diagnostics.", 301 &preferSkipSampling, 302 303 "prefer-algorithm-r", " (Internal) Prefer Algorithm R for unweighted line order randomization. Used for testing and diagnostics.", 304 &preferAlgorithmR, 305 ); 306 307 if (r.helpWanted) 308 { 309 defaultGetoptPrinter(helpText, r.options); 310 return tuple(false, 0); 311 } 312 else if (helpVerbose) 313 { 314 defaultGetoptPrinter(helpTextVerbose, r.options); 315 return tuple(false, 0); 316 } 317 else if (versionWanted) 318 { 319 import tsv_utils.common.tsvutils_version; 320 writeln(tsvutilsVersionNotice("tsv-sample")); 321 return tuple(false, 0); 322 } 323 324 /* Input files. Remaining command line args are files. */ 325 string[] filepaths = (cmdArgs.length > 1) ? cmdArgs[1 .. $] : ["-"]; 326 cmdArgs.length = 1; 327 ReadHeader readHeader = hasHeader ? Yes.readHeader : No.readHeader; 328 inputSources = inputSourceRange(filepaths, readHeader); 329 330 /* Derivations and validations. */ 331 if (weightField > 0) 332 { 333 hasWeightField = true; 334 weightField--; // Switch to zero-based indexes. 335 } 336 337 if (srsWithReplacement) 338 { 339 enforce(!hasWeightField, 340 "Sampling with replacement (--r|replace) does not support weights (--w|weight-field)."); 341 342 enforce(inclusionProbability.isNaN, 343 "Sampling with replacement (--r|replace) cannot be used with probabilities (--p|prob)."); 344 345 enforce(keyFields.length == 0, 346 "Sampling with replacement (--r|replace) cannot be used with distinct sampling (--k|key-fields)."); 347 348 enforce(!printRandom && !genRandomInorder, 349 "Sampling with replacement (--r|replace) does not support random value printing (--print-random, --gen-random-inorder)."); 350 351 enforce(!preserveInputOrder, 352 "Sampling with replacement (--r|replace) does not support input order preservation (--i|inorder option)."); 353 } 354 355 if (keyFields.length > 0) 356 { 357 /* Note: useDistinctSampling is set as part of the inclusion probability checks below. */ 358 359 enforce(!inclusionProbability.isNaN, "--p|prob is required when using --k|key-fields."); 360 361 if (keyFields.length == 1 && keyFields[0] == 0) 362 { 363 distinctKeyIsFullLine = true; 364 } 365 else 366 { 367 enforce(keyFields.length <= 1 || keyFields.all!(x => x != 0), 368 "Whole line as key (--k|key-fields 0) cannot be combined with multiple fields."); 369 370 keyFields.each!((ref x) => --x); // Convert to zero-based indexing. 371 } 372 } 373 374 /* Inclusion probability (--p|prob) is used for both Bernoulli sampling and distinct sampling. */ 375 if (!inclusionProbability.isNaN) 376 { 377 enforce(inclusionProbability > 0.0 && inclusionProbability <= 1.0, 378 format("Invalid --p|prob option: %g. Must satisfy 0.0 < prob <= 1.0.", inclusionProbability)); 379 380 if (keyFields.length > 0) useDistinctSampling = true; 381 else useBernoulliSampling = true; 382 383 enforce(!hasWeightField, "--w|weight-field and --p|prob cannot be used together."); 384 385 enforce(!genRandomInorder || useDistinctSampling, 386 "--gen-random-inorder and --p|prob can only be used together if --k|key-fields is also used." ~ 387 "\nUse --gen-random-inorder alone to print probabilities for all lines." ~ 388 "\nUse --p|prob and --print-random to print probabilities for lines satisfying the probability threshold."); 389 } 390 else if (genRandomInorder && !hasWeightField) 391 { 392 useBernoulliSampling = true; 393 } 394 395 enforce(randomValueHeader.length != 0 && !randomValueHeader.canFind('\n') && 396 !randomValueHeader.canFind(delim), 397 "--randomValueHeader must be at least one character and not contain field delimiters or newlines."); 398 399 /* Check for incompatible use of (--i|inorder) and shuffling of the full 400 * data set. Sampling with replacement is also incompatible, this is 401 * detected earlier. Shuffling is the default operation, so it identified 402 * by eliminating the other modes of operation. 403 */ 404 enforce(!preserveInputOrder || 405 sampleSize != 0 || 406 useBernoulliSampling || 407 useDistinctSampling, 408 "Preserving input order (--i|inorder) is not compatible with full data set shuffling. Switch to random sampling with a sample size (--n|num) to use --i|inorder."); 409 410 /* Compatibility mode checks: 411 * - Random value printing implies compatibility-mode, otherwise user's 412 * selection is used. 413 * - Distinct sampling doesn't support compatibility-mode. The routines 414 * don't care, but users might expect larger probabilities to be a 415 * superset of smaller probabilities. This would be confusing, so 416 * flag it as an error. 417 */ 418 enforce(!(compatibilityMode && useDistinctSampling), 419 "Distinct sampling (--k|key-fields --p|prob) does not support --compatibility-mode."); 420 421 if (printRandom || genRandomInorder) compatibilityMode = true; 422 423 /* Seed. */ 424 import std.random : unpredictableSeed; 425 426 usingUnpredictableSeed = (!staticSeed && seedValueOptionArg == 0); 427 428 if (usingUnpredictableSeed) seed = unpredictableSeed; 429 else if (seedValueOptionArg != 0) seed = seedValueOptionArg; 430 else if (staticSeed) seed = 2438424139; 431 else assert(0, "Internal error, invalid seed option states."); 432 } 433 catch (Exception exc) 434 { 435 stderr.writefln("[%s] Error processing command line arguments: %s", programName, exc.msg); 436 return tuple(false, 1); 437 } 438 return tuple(true, 0); 439 } 440 } 441 /** Invokes the appropriate sampling routine based on the command line arguments. 442 * 443 * tsvSample is the top-level routine handling the different tsv-sample use cases. 444 * Its primary role is to invoke the correct routine for type of sampling requested. 445 */ 446 void tsvSample(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 447 if (isOutputRange!(OutputRange, char)) 448 { 449 if (cmdopt.srsWithReplacement) 450 { 451 simpleRandomSamplingWithReplacement(cmdopt, outputStream); 452 } 453 else if (cmdopt.useBernoulliSampling) 454 { 455 bernoulliSamplingCommand(cmdopt, outputStream); 456 } 457 else if (cmdopt.useDistinctSampling) 458 { 459 if (cmdopt.genRandomInorder) distinctSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 460 else distinctSampling!(No.generateRandomAll)(cmdopt, outputStream); 461 } 462 else if (cmdopt.genRandomInorder) 463 { 464 /* Note that the preceding cases handle gen-random-inorder themselves (Bernoulli, 465 * Distinct), or don't handle it (SRS w/ Replacement). 466 */ 467 assert(cmdopt.hasWeightField); 468 generateWeightedRandomValuesInorder(cmdopt, outputStream); 469 } 470 else if (cmdopt.sampleSize != 0) 471 { 472 randomSamplingCommand(cmdopt, outputStream); 473 } 474 else 475 { 476 shuffleCommand(cmdopt, outputStream); 477 } 478 } 479 480 /** Bernoulli sampling command handler. Invokes the appropriate Bernoulli sampling 481 * routine based on the command line arguments. 482 * 483 * This routine selects the appropriate Bernoulli sampling function and template 484 * instantiation to use based on the command line arguments. 485 * 486 * One of the basic choices is whether to use the vanilla algorithm or skip sampling. 487 * Skip sampling is a little bit faster when the inclusion probability is small but 488 * doesn't support compatibility mode. See the bernoulliSkipSampling documentation 489 * for a discussion of the skipSamplingProbabilityThreshold used here. 490 */ 491 void bernoulliSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 492 if (isOutputRange!(OutputRange, char)) 493 { 494 assert(!cmdopt.hasWeightField); 495 496 immutable double skipSamplingProbabilityThreshold = 0.04; 497 498 if (cmdopt.compatibilityMode || 499 (cmdopt.inclusionProbability > skipSamplingProbabilityThreshold && !cmdopt.preferSkipSampling)) 500 { 501 if (cmdopt.genRandomInorder) 502 { 503 bernoulliSampling!(Yes.generateRandomAll)(cmdopt, outputStream); 504 } 505 else 506 { 507 bernoulliSampling!(No.generateRandomAll)(cmdopt, outputStream); 508 } 509 } 510 else 511 { 512 bernoulliSkipSampling(cmdopt, outputStream); 513 } 514 } 515 516 /** Bernoulli sampling of lines from the input stream. 517 * 518 * Each input line is a assigned a random value and output if less than 519 * cmdopt.inclusionProbability. The order of the lines is not changed. 520 * 521 * This routine supports random value printing and gen-random-inorder value printing. 522 */ 523 void bernoulliSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 524 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 525 if (isOutputRange!(OutputRange, char)) 526 { 527 import std.random : Random = Mt19937, uniform01; 528 import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix; 529 530 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 531 else assert(!cmdopt.genRandomInorder); 532 533 assert(!cmdopt.inputSources.empty); 534 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 535 536 auto randomGenerator = Random(cmdopt.seed); 537 538 /* First header is read during command line argument processing. */ 539 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 540 { 541 auto inputStream = cmdopt.inputSources.front; 542 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 543 544 static if (generateRandomAll) 545 { 546 outputStream.put(cmdopt.randomValueHeader); 547 outputStream.put(cmdopt.delim); 548 } 549 else if (cmdopt.printRandom) 550 { 551 outputStream.put(cmdopt.randomValueHeader); 552 outputStream.put(cmdopt.delim); 553 } 554 555 outputStream.put(inputStream.header); 556 outputStream.put("\n"); 557 } 558 559 /* Process each line. */ 560 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 561 ulong numLinesWritten = 0; 562 563 foreach (inputStream; cmdopt.inputSources) 564 { 565 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 566 567 foreach (ulong fileLineNum, line; 568 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 569 { 570 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 571 572 immutable double lineScore = uniform01(randomGenerator); 573 574 static if (generateRandomAll) 575 { 576 outputStream.formatRandomValue(lineScore); 577 outputStream.put(cmdopt.delim); 578 outputStream.put(line); 579 outputStream.put("\n"); 580 581 if (cmdopt.sampleSize != 0) 582 { 583 ++numLinesWritten; 584 if (numLinesWritten == cmdopt.sampleSize) return; 585 } 586 } 587 else if (lineScore < cmdopt.inclusionProbability) 588 { 589 if (cmdopt.printRandom) 590 { 591 outputStream.formatRandomValue(lineScore); 592 outputStream.put(cmdopt.delim); 593 } 594 outputStream.put(line); 595 outputStream.put("\n"); 596 597 if (cmdopt.sampleSize != 0) 598 { 599 ++numLinesWritten; 600 if (numLinesWritten == cmdopt.sampleSize) return; 601 } 602 } 603 } 604 } 605 } 606 607 /** bernoulliSkipSampling is an implementation of Bernoulli sampling using skips. 608 * 609 * Skip sampling works by skipping a random number of lines between selections. This 610 * can be faster than assigning a random value to each line when the inclusion 611 * probability is low, as it reduces the number of calls to the random number 612 * generator. Both the random number generator and the log() function are called when 613 * calculating the next skip size. These additional log() calls add up as the 614 * inclusion probability increases. 615 * 616 * Performance tests indicate the break-even point is about 4-5% (--prob 0.04) for 617 * file-oriented line sampling. This is obviously environment specific. In the 618 * environments this implementation has been tested in the performance improvements 619 * remain small, less than 7%, even with an inclusion probability as low as 0.0001. 620 * 621 * The algorithm does not assign random values to individual lines. This makes it 622 * incompatible with random value printing. It is not suitable for compatibility mode 623 * either. As an example, in compatibility mode a line selected with '--prob 0.2' should 624 * also be selected with '--prob 0.3' (assuming the same random seed). Skip sampling 625 * does not have this property. 626 * 627 * The algorithm for calculating the skip size has been described by multiple sources. 628 * There are two key variants depending on whether the total number of lines in the 629 * data set is known in advance. (This implementation does not know the total.) 630 * Useful references: 631 * $(LIST 632 * * Jeffrey Scott Vitter, "An Efficient Algorithm for Sequential Random Sampling", 633 * ACM Trans on Mathematical Software, 1987. On-line: 634 * http://www.ittc.ku.edu/~jsv/Papers/Vit87.RandomSampling.pdf 635 * * P.J. Haas, "Data-Stream Sampling: Basic Techniques and Results", from the book 636 * "Data Stream Management", Springer-Verlag, 2016. On-line: 637 * https://www.springer.com/cda/content/document/cda_downloaddocument/9783540286073-c2.pdf 638 * * Erik Erlandson, "Faster Random Samples With Gap Sampling", 2014. On-line: 639 * http://erikerlandson.github.io/blog/2014/09/11/faster-random-samples-with-gap-sampling/ 640 * ) 641 */ 642 void bernoulliSkipSampling(OutputRange)(ref TsvSampleOptions cmdopt, OutputRange outputStream) 643 if (isOutputRange!(OutputRange, char)) 644 { 645 import std.conv : to; 646 import std.math : log, trunc; 647 import std.random : Random = Mt19937, uniform01; 648 import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix; 649 650 assert(cmdopt.inclusionProbability > 0.0 && cmdopt.inclusionProbability < 1.0); 651 assert(!cmdopt.printRandom); 652 assert(!cmdopt.compatibilityMode); 653 654 assert(!cmdopt.inputSources.empty); 655 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 656 657 auto randomGenerator = Random(cmdopt.seed); 658 659 immutable double discardRate = 1.0 - cmdopt.inclusionProbability; 660 immutable double logDiscardRate = log(discardRate); 661 662 /* Note: The '1.0 - uniform01(randomGenerator)' expression flips the half closed 663 * interval to (0.0, 1.0], excluding 0.0. 664 */ 665 size_t remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 666 667 /* First header is read during command line argument processing. */ 668 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 669 { 670 auto inputStream = cmdopt.inputSources.front; 671 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 672 673 outputStream.put(inputStream.header); 674 outputStream.put("\n"); 675 } 676 677 /* Process each line. */ 678 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 679 ulong numLinesWritten = 0; 680 foreach (inputStream; cmdopt.inputSources) 681 { 682 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 683 684 foreach (ulong fileLineNum, line; 685 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 686 { 687 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 688 689 if (remainingSkips > 0) 690 { 691 --remainingSkips; 692 } 693 else 694 { 695 outputStream.put(line); 696 outputStream.put("\n"); 697 698 if (cmdopt.sampleSize != 0) 699 { 700 ++numLinesWritten; 701 if (numLinesWritten == cmdopt.sampleSize) return; 702 } 703 704 remainingSkips = (log(1.0 - uniform01(randomGenerator)) / logDiscardRate).trunc.to!size_t; 705 } 706 } 707 } 708 } 709 710 /** Sample lines by choosing a random set of distinct keys formed from one or more 711 * fields on each line. 712 * 713 * Distinct sampling is a streaming form of sampling, similar to Bernoulli sampling. 714 * However, instead of each line being subject to an independent trial, lines are 715 * selected based on a key from each line. A portion of keys are randomly selected for 716 * output, and every line containing a selected key is included in the output. 717 * 718 * An example use-case is a query log having <user, query, clicked-url> triples. It is 719 * often useful to sample records for portion of the users, but including all records 720 * for the users selected. Distinct sampling supports this by selecting a subset of 721 * users to include in the output. 722 * 723 * Distinct sampling is done by hashing the key and mapping the hash value into 724 * buckets sized to hold the inclusion probability. Records having a key mapping to 725 * bucket zero are output. Buckets are equal size and therefore may be larger than the 726 * inclusion probability. (The other approach would be to have the caller specify the 727 * the number of buckets. More correct, but less convenient.) 728 */ 729 void distinctSampling(Flag!"generateRandomAll" generateRandomAll, OutputRange) 730 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 731 if (isOutputRange!(OutputRange, char)) 732 { 733 import std.algorithm : splitter; 734 import std.conv : to; 735 import std.digest.murmurhash; 736 import std.math : lrint; 737 import tsv_utils.common.utils : bufferedByLine, InputFieldReordering, 738 InputSourceRange, throwIfWindowsNewlineOnUnix; 739 740 static if (generateRandomAll) assert(cmdopt.genRandomInorder); 741 else assert(!cmdopt.genRandomInorder); 742 743 assert(cmdopt.keyFields.length > 0); 744 assert(0.0 < cmdopt.inclusionProbability && cmdopt.inclusionProbability <= 1.0); 745 746 assert(!cmdopt.inputSources.empty); 747 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 748 749 static if (generateRandomAll) 750 { 751 import std.format : formatValue, singleSpec; 752 immutable randomValueFormatSpec = singleSpec("%d"); 753 } 754 755 immutable ubyte[1] delimArray = [cmdopt.delim]; // For assembling multi-field hash keys. 756 757 uint numBuckets = (1.0 / cmdopt.inclusionProbability).lrint.to!uint; 758 759 /* Create a mapping for the key fields. */ 760 auto keyFieldsReordering = cmdopt.distinctKeyIsFullLine ? null : new InputFieldReordering!char(cmdopt.keyFields); 761 762 /* First header is read during command line argument processing. */ 763 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 764 { 765 auto inputStream = cmdopt.inputSources.front; 766 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 767 768 static if (generateRandomAll) 769 { 770 outputStream.put(cmdopt.randomValueHeader); 771 outputStream.put(cmdopt.delim); 772 } 773 else if (cmdopt.printRandom) 774 { 775 outputStream.put(cmdopt.randomValueHeader); 776 outputStream.put(cmdopt.delim); 777 } 778 779 outputStream.put(inputStream.header); 780 outputStream.put("\n"); 781 } 782 783 /* Process each line. */ 784 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 785 ulong numLinesWritten = 0; 786 787 foreach (inputStream; cmdopt.inputSources) 788 { 789 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 790 791 foreach (ulong fileLineNum, line; 792 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 793 { 794 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 795 796 /* Murmurhash works by successively adding individual keys, then finalizing. 797 * Adding individual keys is simpler if the full-line-as-key and individual 798 * fields as keys cases are separated. 799 */ 800 auto hasher = MurmurHash3!32(cmdopt.seed); 801 802 if (cmdopt.distinctKeyIsFullLine) 803 { 804 hasher.put(cast(ubyte[]) line); 805 } 806 else 807 { 808 assert(keyFieldsReordering !is null); 809 810 /* Gather the key field values and assemble the key. */ 811 keyFieldsReordering.initNewLine; 812 foreach (fieldIndex, fieldValue; line.splitter(cmdopt.delim).enumerate) 813 { 814 keyFieldsReordering.processNextField(fieldIndex, fieldValue); 815 if (keyFieldsReordering.allFieldsFilled) break; 816 } 817 818 enforce(keyFieldsReordering.allFieldsFilled, 819 format("Not enough fields in line. File: %s, Line: %s", 820 inputStream.name, fileLineNum)); 821 822 foreach (count, key; keyFieldsReordering.outputFields.enumerate) 823 { 824 if (count > 0) hasher.put(delimArray); 825 hasher.put(cast(ubyte[]) key); 826 } 827 } 828 829 hasher.finish; 830 831 static if (generateRandomAll) 832 { 833 import std.conv : to; 834 outputStream.formatValue(hasher.get % numBuckets, randomValueFormatSpec); 835 outputStream.put(cmdopt.delim); 836 outputStream.put(line); 837 outputStream.put("\n"); 838 839 if (cmdopt.sampleSize != 0) 840 { 841 ++numLinesWritten; 842 if (numLinesWritten == cmdopt.sampleSize) return; 843 } 844 } 845 else if (hasher.get % numBuckets == 0) 846 { 847 if (cmdopt.printRandom) 848 { 849 outputStream.put('0'); 850 outputStream.put(cmdopt.delim); 851 } 852 outputStream.put(line); 853 outputStream.put("\n"); 854 855 if (cmdopt.sampleSize != 0) 856 { 857 ++numLinesWritten; 858 if (numLinesWritten == cmdopt.sampleSize) return; 859 } 860 } 861 } 862 } 863 } 864 865 /** Random sampling command handler. Invokes the appropriate sampling routine based on 866 * the command line arguments. 867 * 868 * Random sampling selects a fixed size random sample from the input stream. Both 869 * simple random sampling (equal likelihood) and weighted random sampling are 870 * supported. Selected lines are output either in random order or original input order. 871 * For weighted sampling the random order is the weighted selection order. 872 * 873 * Two algorithms are used, reservoir sampling via a heap and reservoir sampling via 874 * Algorithm R. This routine selects the appropriate reservoir sampling function and 875 * template instantiation to based on the command line arguments. 876 * 877 * Weighted sampling always uses the heap approach. Compatibility mode does as well, 878 * as it is the method that uses per-line random value assignments. The implication 879 * of compatibility mode is that a larger sample size includes all the results from 880 * a smaller sample, assuming the same random seed is used. 881 * 882 * For unweighted sampling there is a performance tradeoff between implementations. 883 * Heap-based sampling is faster for small sample sizes. Algorithm R is faster for 884 * large sample sizes. The threshold used was chosen based on performance tests. See 885 * the reservoirSamplingAlgorithmR documentation for more information. 886 */ 887 888 void randomSamplingCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 889 if (isOutputRange!(OutputRange, char)) 890 { 891 assert(cmdopt.sampleSize != 0); 892 893 immutable size_t algorithmRSampleSizeThreshold = 128 * 1024; 894 895 if (cmdopt.hasWeightField) 896 { 897 if (cmdopt.preserveInputOrder) 898 { 899 reservoirSamplingViaHeap!(Yes.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 900 } 901 else 902 { 903 reservoirSamplingViaHeap!(Yes.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 904 } 905 } 906 else if (cmdopt.compatibilityMode || 907 (cmdopt.sampleSize < algorithmRSampleSizeThreshold && !cmdopt.preferAlgorithmR)) 908 { 909 if (cmdopt.preserveInputOrder) 910 { 911 reservoirSamplingViaHeap!(No.isWeighted, Yes.preserveInputOrder)(cmdopt, outputStream); 912 } 913 else 914 { 915 reservoirSamplingViaHeap!(No.isWeighted, No.preserveInputOrder)(cmdopt, outputStream); 916 } 917 } 918 else if (cmdopt.preserveInputOrder) 919 { 920 reservoirSamplingAlgorithmR!(Yes.preserveInputOrder)(cmdopt, outputStream); 921 } 922 else 923 { 924 reservoirSamplingAlgorithmR!(No.preserveInputOrder)(cmdopt, outputStream); 925 } 926 } 927 928 /** Reservoir sampling using a heap. Both weighted and unweighted random sampling are 929 * supported. 930 * 931 * The algorithm used here is based on the one-pass algorithm described by Pavlos 932 * Efraimidis and Paul Spirakis ("Weighted Random Sampling over Data Streams", Pavlos S. 933 * Efraimidis, https://arxiv.org/abs/1012.0256). In the unweighted case weights are 934 * simply set to one. 935 * 936 * The implementation uses a heap (priority queue) large enough to hold the desired 937 * number of lines. Input is read line-by-line, assigned a random value, and added to 938 * the heap. The role of the heap is to identify the lines with the highest assigned 939 * random values. Once the heap is full, adding a new line means dropping the line with 940 * the lowest score. A "min" heap used for this reason. 941 * 942 * When done reading all lines, the "min" heap is in reverse of weighted selection 943 * order. Weighted selection order is obtained by removing each element one at at time 944 * from the heap. The underlying data store will have the elements in weighted selection 945 * order (largest weights first). 946 * 947 * Generating output in weighted order is useful for several reasons: 948 * - For weighted sampling, it preserves the property that smaller valid subsets can be 949 * created by taking the first N lines. 950 * - For unweighted sampling, it ensures that all output permutations are possible, and 951 * are not influenced by input order or the heap data structure used. 952 * - Order consistency is maintained when making repeated use of the same random seed, 953 * but with different sample sizes. 954 * 955 * The other choice is preserving input order. This is supporting by recording line 956 * numbers and sorting the selected sample. 957 * 958 * There are use cases where only the selection set matters. For these some performance 959 * could be gained by skipping the reordering and simply printing the backing store 960 * array in-order. Performance tests indicate only a minor benefit, so this is not 961 * supported. 962 * 963 * Notes: 964 * $(LIST 965 * * In tsv-sample versions 1.2.1 and earlier this routine also supported 966 * randomization of all input lines. This was dropped in version 1.2.2 in favor 967 * of the approach used in randomizeLines. The latter has significant advantages 968 * given that all data must be read into memory. 969 * * For large reservoir sizes better performance can be achieved using Algorithm R. 970 * See the reservoirSamplingAlgorithmR documentation for details. 971 * ) 972 */ 973 void reservoirSamplingViaHeap(Flag!"isWeighted" isWeighted, Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 974 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 975 if (isOutputRange!(OutputRange, char)) 976 { 977 import std.algorithm : sort; 978 import std.container.array; 979 import std.container.binaryheap; 980 import std.meta : AliasSeq; 981 import std.random : Random = Mt19937, uniform01; 982 import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix; 983 984 static if (isWeighted) assert(cmdopt.hasWeightField); 985 else assert(!cmdopt.hasWeightField); 986 987 assert(cmdopt.sampleSize > 0); 988 989 assert(!cmdopt.inputSources.empty); 990 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 991 992 auto randomGenerator = Random(cmdopt.seed); 993 994 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 995 { 996 double score; 997 const(char)[] line; 998 static if (preserveInputOrder) ulong lineNumber; 999 } 1000 1001 /* Create the heap and backing data store. 1002 * 1003 * Note: An std.container.array is used as the backing store to avoid some issues in 1004 * the standard library (Phobos) binaryheap implementation. Specifically, when an 1005 * std.container.array is used as backing store, the heap can efficiently reversed by 1006 * removing the heap elements. This leaves the backing store in the reversed order. 1007 * However, the current binaryheap implementation does not support this for all 1008 * backing stores. See: https://issues.dlang.org/show_bug.cgi?id=17094. 1009 */ 1010 1011 Array!(Entry!preserveInputOrder) dataStore; 1012 dataStore.reserve(cmdopt.sampleSize); 1013 auto reservoir = dataStore.heapify!("a.score > b.score")(0); // Min binaryheap 1014 1015 /* First header is read during command line argument processing. */ 1016 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1017 { 1018 auto inputStream = cmdopt.inputSources.front; 1019 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1020 1021 if (cmdopt.printRandom) 1022 { 1023 outputStream.put(cmdopt.randomValueHeader); 1024 outputStream.put(cmdopt.delim); 1025 } 1026 outputStream.put(inputStream.header); 1027 outputStream.put("\n"); 1028 } 1029 1030 /* Process each line. */ 1031 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1032 static if (preserveInputOrder) ulong totalLineNum = 0; 1033 1034 foreach (inputStream; cmdopt.inputSources) 1035 { 1036 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1037 1038 foreach (ulong fileLineNum, line; 1039 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1040 { 1041 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1042 1043 static if (!isWeighted) 1044 { 1045 immutable double lineScore = uniform01(randomGenerator); 1046 } 1047 else 1048 { 1049 immutable double lineWeight = 1050 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); 1051 immutable double lineScore = 1052 (lineWeight > 0.0) 1053 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1054 : 0.0; 1055 } 1056 1057 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1058 else alias entryCTArgs = AliasSeq!(); 1059 1060 if (reservoir.length < cmdopt.sampleSize) 1061 { 1062 reservoir.insert(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1063 } 1064 else if (reservoir.front.score < lineScore) 1065 { 1066 reservoir.replaceFront(Entry!preserveInputOrder(lineScore, line.dup, entryCTArgs)); 1067 } 1068 1069 static if (preserveInputOrder) ++totalLineNum; 1070 } 1071 } 1072 1073 /* Done with input, all entries are in the reservoir. */ 1074 1075 /* The asserts here avoid issues with the current binaryheap implementation. They 1076 * detect use of backing stores having a length not synchronized to the reservoir. 1077 */ 1078 immutable ulong numLines = reservoir.length; 1079 assert(numLines == dataStore.length); 1080 1081 /* Update the backing store so it is in the desired output order. 1082 */ 1083 static if (preserveInputOrder) 1084 { 1085 dataStore[].sort!((a, b) => a.lineNumber < b.lineNumber); 1086 } 1087 else 1088 { 1089 /* Output in weighted selection order. The heap is in reverse order of assigned 1090 * weights. Reversing order is done by removing all elements from the heap. This 1091 * leaves the backing store in the correct order. 1092 */ 1093 while (!reservoir.empty) reservoir.removeFront; 1094 } 1095 1096 assert(numLines == dataStore.length); 1097 1098 foreach (entry; dataStore) 1099 { 1100 if (cmdopt.printRandom) 1101 { 1102 outputStream.formatRandomValue(entry.score); 1103 outputStream.put(cmdopt.delim); 1104 } 1105 outputStream.put(entry.line); 1106 outputStream.put("\n"); 1107 } 1108 } 1109 1110 /** Generate weighted random values for all input lines, preserving input order. 1111 * 1112 * This complements weighted reservoir sampling, but instead of using a reservoir it 1113 * simply iterates over the input lines generating the values. The weighted random 1114 * values are generated with the same formula used by reservoirSampling. 1115 */ 1116 void generateWeightedRandomValuesInorder(OutputRange) 1117 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1118 if (isOutputRange!(OutputRange, char)) 1119 { 1120 import std.random : Random = Mt19937, uniform01; 1121 import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix; 1122 1123 assert(cmdopt.hasWeightField); 1124 1125 assert(!cmdopt.inputSources.empty); 1126 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1127 1128 auto randomGenerator = Random(cmdopt.seed); 1129 1130 /* First header is read during command line argument processing. */ 1131 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1132 { 1133 auto inputStream = cmdopt.inputSources.front; 1134 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1135 1136 outputStream.put(cmdopt.randomValueHeader); 1137 outputStream.put(cmdopt.delim); 1138 outputStream.put(inputStream.header); 1139 outputStream.put("\n"); 1140 } 1141 1142 /* Process each line. */ 1143 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1144 ulong numLinesWritten = 0; 1145 1146 foreach (inputStream; cmdopt.inputSources) 1147 { 1148 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1149 1150 foreach (ulong fileLineNum, line; 1151 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1152 { 1153 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1154 1155 immutable double lineWeight = 1156 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, inputStream.name, fileLineNum); 1157 1158 immutable double lineScore = 1159 (lineWeight > 0.0) 1160 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1161 : 0.0; 1162 1163 outputStream.formatRandomValue(lineScore); 1164 outputStream.put(cmdopt.delim); 1165 outputStream.put(line); 1166 outputStream.put("\n"); 1167 1168 if (cmdopt.sampleSize != 0) 1169 { 1170 ++numLinesWritten; 1171 if (numLinesWritten == cmdopt.sampleSize) return; 1172 } 1173 } 1174 } 1175 } 1176 1177 /** Reservoir sampling via Algorithm R 1178 * 1179 * This is an implementation of reservoir sampling using what is commonly known as 1180 * "Algorithm R", credited to Alan Waterman by Donald Knuth in the "The Art of 1181 * Computer Programming, Volume 2: Seminumerical Algorithms". More information about 1182 * the algorithm can be found in Jeffrey Vitter's classic paper "Random Sampling with 1183 * a Reservoir" (1985) as well as the Wikipedia article "Reservoir Sampling" 1184 * (https://en.wikipedia.org/wiki/Reservoir_sampling#Algorithm_R). 1185 * 1186 * Algorithm R is used for unweighted sampling without replacement. The heap-based 1187 * algorithm in reservoirSamplingViaHeap is used for weighted sampling. 1188 * 1189 * The classic algorithm stops after identifying the selected set of items. This 1190 * implementation goes one step further and randomizes the order of the selected 1191 * lines. This is consistent with shuffling (line order randomization), a primary 1192 * tsv-sample use-case. 1193 * 1194 * This algorithm is faster than reservoirSamplingViaHeap when the sample size 1195 * (reservoir size) is large. Heap insertion is O(log k), where k is the sample size. 1196 * Insertion in this algorithm is O(1). Similarly, generating the random order in the 1197 * heap is O(k * log k), while in this algorithm the final randomization step is O(k). 1198 * 1199 * This speed advantage may be offset a certain amount by using a more expensive random 1200 * value generator. reservoirSamplingViaHeap generates values between zero and one, 1201 * whereas reservoirSamplingAlgorithmR generates random integers over and ever growing 1202 * interval. The latter is expected to be more expensive. This is consistent with 1203 * performance tests indicating that reservoirSamplingViaHeap is faster when using 1204 * small-to-medium size reservoirs and large input streams. 1205 */ 1206 void reservoirSamplingAlgorithmR(Flag!"preserveInputOrder" preserveInputOrder, OutputRange) 1207 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1208 if (isOutputRange!(OutputRange, char)) 1209 { 1210 import std.meta : AliasSeq; 1211 import std.random : Random = Mt19937, randomShuffle, uniform; 1212 import std.algorithm : sort; 1213 import tsv_utils.common.utils : bufferedByLine, InputSourceRange, throwIfWindowsNewlineOnUnix; 1214 1215 assert(cmdopt.sampleSize > 0); 1216 assert(!cmdopt.hasWeightField); 1217 assert(!cmdopt.compatibilityMode); 1218 assert(!cmdopt.printRandom); 1219 assert(!cmdopt.genRandomInorder); 1220 1221 assert(!cmdopt.inputSources.empty); 1222 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1223 1224 static struct Entry(Flag!"preserveInputOrder" preserveInputOrder) 1225 { 1226 const(char)[] line; 1227 static if (preserveInputOrder) ulong lineNumber; 1228 } 1229 1230 Entry!preserveInputOrder[] reservoir; 1231 auto reservoirAppender = appender(&reservoir); 1232 reservoirAppender.reserve(cmdopt.sampleSize); 1233 1234 auto randomGenerator = Random(cmdopt.seed); 1235 1236 /* First header is read during command line argument processing. */ 1237 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1238 { 1239 auto inputStream = cmdopt.inputSources.front; 1240 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1241 1242 outputStream.put(inputStream.header); 1243 outputStream.put("\n"); 1244 } 1245 1246 /* Process each line. */ 1247 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 2 : 1; 1248 ulong totalLineNum = 0; 1249 1250 foreach (inputStream; cmdopt.inputSources) 1251 { 1252 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1253 1254 foreach (ulong fileLineNum, line; 1255 inputStream.file.bufferedByLine!(KeepTerminator.no).enumerate(fileBodyStartLine)) 1256 { 1257 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, inputStream.name, fileLineNum); 1258 1259 /* Add lines to the reservoir until the reservoir is filled. 1260 * After that lines are added with decreasing likelihood, based on 1261 * the total number of lines seen. If added to the reservoir, the 1262 * line replaces a randomly chosen existing line. 1263 */ 1264 static if (preserveInputOrder) alias entryCTArgs = AliasSeq!(totalLineNum); 1265 else alias entryCTArgs = AliasSeq!(); 1266 1267 if (totalLineNum < cmdopt.sampleSize) 1268 { 1269 reservoirAppender ~= Entry!preserveInputOrder(line.idup, entryCTArgs); 1270 } 1271 else 1272 { 1273 immutable size_t i = uniform(0, totalLineNum, randomGenerator); 1274 if (i < reservoir.length) 1275 { 1276 reservoir[i] = Entry!preserveInputOrder(line.idup, entryCTArgs); 1277 } 1278 } 1279 1280 ++totalLineNum; 1281 } 1282 } 1283 1284 /* Done with input. The sample is in the reservoir. Update the order and print. */ 1285 1286 static if (preserveInputOrder) 1287 { 1288 reservoir.sort!((a, b) => a.lineNumber < b.lineNumber); 1289 } 1290 else 1291 { 1292 reservoir.randomShuffle(randomGenerator); 1293 } 1294 1295 foreach (ref entry; reservoir) 1296 { 1297 outputStream.put(entry.line); 1298 outputStream.put("\n"); 1299 } 1300 } 1301 1302 /** Shuffling command handler. Invokes the appropriate shuffle (line order 1303 * randomization) routine based on the command line arguments. 1304 * 1305 * Shuffling has similarities to random sampling, but the algorithms used are 1306 * different. Random sampling selects a subset, only the current subset selection 1307 * needs to be kept in memory. This is supported by reservoir sampling. By contrast, 1308 * shuffling needs to hold all input in memory, so it works better to read all lines 1309 * into memory at once and then shuffle. 1310 * 1311 * Two different algorithms are used. Array shuffling is used for unweighted shuffling. 1312 * Sorting plus random weight assignments is used for weighted shuffling and when 1313 * compatibility mode is being used. 1314 * 1315 * The algorithms used here are all limited by available memory. 1316 */ 1317 void shuffleCommand(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1318 if (isOutputRange!(OutputRange, char)) 1319 { 1320 if (cmdopt.hasWeightField) 1321 { 1322 randomizeLinesViaSort!(Yes.isWeighted)(cmdopt, outputStream); 1323 } 1324 else if (cmdopt.compatibilityMode) 1325 { 1326 randomizeLinesViaSort!(No.isWeighted)(cmdopt, outputStream); 1327 } 1328 else 1329 { 1330 randomizeLinesViaShuffle(cmdopt, outputStream); 1331 } 1332 } 1333 1334 /** Shuffle all input lines by assigning random weights and sorting. 1335 * 1336 * randomizeLinesViaSort reads in all input lines and writes them out in random order. 1337 * The algorithm works by assigning a random value to each line and sorting. Both 1338 * weighted and unweighted shuffling are supported. 1339 * 1340 * Notes: 1341 * $(LIST 1342 * * For unweighted shuffling randomizeLinesViaShuffle is faster and should be used 1343 * unless compatibility mode is needed. 1344 * * This routine is significantly faster than heap-based reservoir sampling in the 1345 * case where the entire file is being read. 1346 * * Input data must be read entirely in memory. Disk oriented techniques are needed 1347 * when data sizes get too large for available memory. One option is to generate 1348 * random values for each line, e.g. --gen-random-inorder, and sort with a disk- 1349 * backed sort program like GNU sort. 1350 * ) 1351 */ 1352 void randomizeLinesViaSort(Flag!"isWeighted" isWeighted, OutputRange) 1353 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1354 if (isOutputRange!(OutputRange, char)) 1355 { 1356 import std.algorithm : map, sort; 1357 1358 static if (isWeighted) assert(cmdopt.hasWeightField); 1359 else assert(!cmdopt.hasWeightField); 1360 1361 assert(cmdopt.sampleSize == 0); 1362 1363 /* 1364 * Read all file data into memory. Then split the data into lines and assign a 1365 * random value to each line. readFileData also writes the first header line. 1366 */ 1367 const fileData = readFileData!(Yes.hasRandomValue)(cmdopt, outputStream); 1368 auto inputLines = fileData.identifyInputLines!(Yes.hasRandomValue, isWeighted)(cmdopt); 1369 1370 /* 1371 * Sort by the weight and output the lines. 1372 */ 1373 inputLines.sort!((a, b) => a.randomValue > b.randomValue); 1374 1375 foreach (lineEntry; inputLines) 1376 { 1377 if (cmdopt.printRandom) 1378 { 1379 outputStream.formatRandomValue(lineEntry.randomValue); 1380 outputStream.put(cmdopt.delim); 1381 } 1382 outputStream.put(lineEntry.data); 1383 outputStream.put("\n"); 1384 } 1385 } 1386 1387 /** Shuffle (randomize) all input lines using a shuffling algorithm. 1388 * 1389 * All lines in files and/or standard input are read in and written out in random 1390 * order. This routine uses array shuffling, which is faster than sorting. It is a 1391 * good alternative to randomizeLinesViaSort when doing unweighted shuffling (the 1392 * most common case). 1393 * 1394 * Input data size is limited by available memory. Disk oriented techniques are needed 1395 * when data sizes are larger. For example, generating random values line-by-line (ala 1396 * --gen-random-inorder) and sorting with a disk-backed sort program like GNU sort. 1397 * 1398 * This routine does not support random value printing or compatibility-mode. 1399 */ 1400 void randomizeLinesViaShuffle(OutputRange)(ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1401 if (isOutputRange!(OutputRange, char)) 1402 { 1403 import std.algorithm : map; 1404 import std.random : Random = Mt19937, randomShuffle; 1405 1406 assert(cmdopt.sampleSize == 0); 1407 assert(!cmdopt.hasWeightField); 1408 assert(!cmdopt.printRandom); 1409 assert(!cmdopt.genRandomInorder); 1410 1411 /* 1412 * Read all file data into memory and split into lines. 1413 */ 1414 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1415 auto inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1416 1417 /* 1418 * Randomly shuffle and print each line. 1419 * 1420 * Note: Also tried randomCover, but that was exceedingly slow. 1421 */ 1422 import std.random : randomShuffle; 1423 1424 auto randomGenerator = Random(cmdopt.seed); 1425 inputLines.randomShuffle(randomGenerator); 1426 1427 foreach (ref line; inputLines) 1428 { 1429 outputStream.put(line.data); 1430 outputStream.put("\n"); 1431 } 1432 } 1433 1434 /** Simple random sampling with replacement. 1435 * 1436 * All lines in files and/or standard input are read in. Then random lines are selected 1437 * one at a time and output. Lines can be selected multiple times. This process continues 1438 * until the desired number of samples (--n|num) has been output. Output continues 1439 * indefinitely if a sample size was not provided. 1440 */ 1441 void simpleRandomSamplingWithReplacement(OutputRange) 1442 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1443 if (isOutputRange!(OutputRange, char)) 1444 { 1445 import std.algorithm : map; 1446 import std.random : Random = Mt19937, uniform; 1447 1448 /* 1449 * Read all file data into memory and split the data into lines. 1450 */ 1451 const fileData = readFileData!(No.hasRandomValue)(cmdopt, outputStream); 1452 const inputLines = fileData.identifyInputLines!(No.hasRandomValue, No.isWeighted)(cmdopt); 1453 1454 if (inputLines.length > 0) 1455 { 1456 auto randomGenerator = Random(cmdopt.seed); 1457 1458 /* Repeat forever is sampleSize is zero, otherwise print sampleSize lines. */ 1459 size_t numLeft = (cmdopt.sampleSize == 0) ? 1 : cmdopt.sampleSize; 1460 while (numLeft != 0) 1461 { 1462 immutable size_t index = uniform(0, inputLines.length, randomGenerator); 1463 outputStream.put(inputLines[index].data); 1464 outputStream.put("\n"); 1465 if (cmdopt.sampleSize != 0) numLeft--; 1466 } 1467 } 1468 } 1469 1470 /** A container holding data read from a file or standard input. 1471 * 1472 * The InputBlock struct is used to represent a block of data read from a file or 1473 * standard input. An array of InputBlocks is returned by readFileData. Typically one 1474 * block per file. Multiple blocks are used for standard input and when the file size 1475 * cannot be determined. Individual lines are not allowed to span blocks. The blocks 1476 * allocated to an individual file are numbered starting with zero. 1477 * 1478 * See readFileData() for more information. 1479 */ 1480 static struct InputBlock 1481 { 1482 string filename; /// Original filename or path. "-" denotes standard input. 1483 size_t fileBlockNumber; /// Zero-based block number for the file. 1484 char[] data; /// The actual data. Newline terminated or last block for the file. 1485 } 1486 1487 /** Read data from one or more files. This routine is used by algorithms needing to 1488 * read all data into memory. 1489 * 1490 * readFileData reads in all data from a set of files. Data is returned as an array 1491 * of InputBlock structs. Normally one InputBlock per file, sized to match the size 1492 * of the file. Standard input is read in one or more blocks, as are files whose size 1493 * cannot be determined. Multiple blocks are used in these last two cases to avoid 1494 * expensive memory reallocations. This is not necessary when file size is known as 1495 * the necessary memory can be preallocated. 1496 * 1497 * Individual lines never span multiple blocks, and newlines are preserved. This 1498 * means that each block starts at the beginning of a line and ends with a newline 1499 * unless the end of a file has been reached. 1500 * 1501 * Each file gets its own block. Prior to using InputSourceRange this was so header 1502 * processing can be done. With InputSourceRange the header is read separately, so 1503 * this could be changed. 1504 */ 1505 InputBlock[] readFileData(HasRandomValue hasRandomValue, OutputRange) 1506 (ref TsvSampleOptions cmdopt, auto ref OutputRange outputStream) 1507 if (isOutputRange!(OutputRange, char)) 1508 { 1509 import std.algorithm : find, min; 1510 import std.range : retro; 1511 import tsv_utils.common.utils : InputSourceRange, throwIfWindowsNewlineOnUnix; 1512 1513 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1514 1515 assert(!cmdopt.inputSources.empty); 1516 static assert(is(typeof(cmdopt.inputSources) == InputSourceRange)); 1517 1518 /* First header is read during command line argument processing. */ 1519 if (cmdopt.hasHeader && !cmdopt.inputSources.front.isHeaderEmpty) 1520 { 1521 auto inputStream = cmdopt.inputSources.front; 1522 throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1523 1524 if (cmdopt.printRandom) 1525 { 1526 outputStream.put(cmdopt.randomValueHeader); 1527 outputStream.put(cmdopt.delim); 1528 } 1529 outputStream.put(inputStream.header); 1530 outputStream.put("\n"); 1531 } 1532 1533 enum BlockSize = 1024L * 1024L * 1024L; // 1 GB. ('L' notation avoids overflow w/ 2GB+ sizes.) 1534 enum ReadSize = 1024L * 128L; 1535 enum NewlineSearchSize = 1024L * 16L; 1536 1537 InputBlock[] blocks; 1538 auto blocksAppender = appender(&blocks); 1539 blocksAppender.reserve(cmdopt.inputSources.length); // At least one block per file. 1540 1541 ubyte[] rawReadBuffer = new ubyte[ReadSize]; 1542 1543 foreach (inputStream; cmdopt.inputSources) 1544 { 1545 if (cmdopt.hasHeader) throwIfWindowsNewlineOnUnix(inputStream.header, inputStream.name, 1); 1546 1547 /* If the file size can be determined then read it as a single block. 1548 * Otherwise read as multiple blocks. File.size() returns ulong.max 1549 * if file size cannot be determined, so we'll combine that check 1550 * with the standard input case. 1551 */ 1552 1553 immutable ulong filesize = inputStream.isStdin ? ulong.max : inputStream.file.size; 1554 auto ifile = inputStream.file; 1555 1556 if (filesize != ulong.max) 1557 { 1558 readFileDataAsOneBlock(inputStream.name, ifile, filesize, 1559 blocksAppender, rawReadBuffer); 1560 } 1561 else 1562 { 1563 readFileDataAsMultipleBlocks( 1564 inputStream.name, ifile, blocksAppender, rawReadBuffer, 1565 BlockSize, NewlineSearchSize); 1566 } 1567 } 1568 return blocks; 1569 } 1570 1571 /* readFileData() helper function. Read data from a File handle as a single block. The 1572 * new block is appended to an existing InputBlock[] array. 1573 * 1574 * readFileDataAsOneBlocks is part of the readFileData logic. It handles the case 1575 * where a file is being read as a single block. Normally initialBlockSize is passed 1576 * as the size of the file. 1577 * 1578 * This routine has been separated out to enable unit testing. At present it is not 1579 * intended as a general API. See readFileData for more info. 1580 */ 1581 private void readFileDataAsOneBlock( 1582 string filename, 1583 ref File ifile, 1584 const ulong initialBlockSize, 1585 ref RefAppender!(InputBlock[]) blocksAppender, 1586 ref ubyte[] rawReadBuffer) 1587 { 1588 blocksAppender.put(InputBlock(filename, 0)); 1589 auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 1590 dataAppender.reserve(initialBlockSize); 1591 1592 foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) 1593 { 1594 dataAppender.put(cast(char[]) buffer); 1595 } 1596 } 1597 1598 /* readFileData() helper function. Read data from a File handle as one or more blocks. 1599 * Blocks are appended to an existing InputBlock[] array. 1600 * 1601 * readFileDataAsMultipleBlocks is part of the readFileData logic. It handles the case 1602 * where a file or standard input is being read as a series of blocks. This is the 1603 * standard approach for standard input, but also applies when the file size cannot be 1604 * determined. 1605 * 1606 * This routine has been separated out to enable unit testing. At present it is not 1607 * intended as a general API. See readFileData for more info. 1608 */ 1609 private void readFileDataAsMultipleBlocks( 1610 string filename, 1611 ref File ifile, 1612 ref RefAppender!(InputBlock[]) blocksAppender, 1613 ref ubyte[] rawReadBuffer, 1614 const size_t blockSize, 1615 const size_t newlineSearchSize) 1616 { 1617 import std.algorithm : find, min; 1618 import std.range : retro; 1619 1620 assert(ifile.isOpen); 1621 1622 /* Create a new block for the file and an Appender for writing data. 1623 */ 1624 blocksAppender.put(InputBlock(filename, 0)); 1625 auto dataAppender = appender(&(blocksAppender.data[$-1].data)); 1626 dataAppender.reserve(blockSize); 1627 size_t blockNumber = 0; 1628 1629 /* Read all the data and copy it to an InputBlock. */ 1630 foreach (ref ubyte[] buffer; ifile.byChunk(rawReadBuffer)) 1631 { 1632 assert(blockNumber == blocksAppender.data[$-1].fileBlockNumber); 1633 1634 immutable size_t remainingCapacity = dataAppender.capacity - dataAppender.data.length; 1635 1636 if (buffer.length <= remainingCapacity) 1637 { 1638 dataAppender.put(cast(char[]) buffer); 1639 } 1640 else 1641 { 1642 /* Look for the last newline in the input buffer that fits in remaining 1643 * capacity of the block. 1644 */ 1645 auto searchRegion = buffer[0 .. remainingCapacity]; 1646 auto appendRegion = searchRegion.retro.find('\n').source; 1647 1648 if (appendRegion.length > 0) 1649 { 1650 /* Copy the first part of the read buffer to the block. */ 1651 dataAppender.put(cast(char[]) appendRegion); 1652 1653 /* Create a new InputBlock and copy the remaining data to it. */ 1654 blockNumber++; 1655 blocksAppender.put(InputBlock(filename, blockNumber)); 1656 dataAppender = appender(&(blocksAppender.data[$-1].data)); 1657 dataAppender.reserve(blockSize); 1658 dataAppender.put(cast(char[]) buffer[appendRegion.length .. $]); 1659 1660 assert(blocksAppender.data.length >= 2); 1661 assert(blocksAppender.data[$-2].data[$-1] == '\n'); 1662 } 1663 else 1664 { 1665 /* Search backward in the current block for a newline. If found, it 1666 * becomes the last newline in the current block. Anything following 1667 * it is moved to the block. If a newline is not found, simply append 1668 * to the current block and let it grow. We'll only search backward 1669 * so far. 1670 */ 1671 immutable size_t currBlockLength = blocksAppender.data[$-1].data.length; 1672 immutable size_t searchLength = min(currBlockLength, newlineSearchSize); 1673 immutable size_t searchStart = currBlockLength - searchLength; 1674 auto blockSearchRegion = blocksAppender.data[$-1].data[searchStart .. $]; 1675 auto lastNewlineOffset = blockSearchRegion.retro.find('\n').source.length; 1676 1677 if (lastNewlineOffset != 0) 1678 { 1679 /* Create a new InputBlock. The previous InputBlock is then found 1680 * at blocksAppender.data[$-2]. It may be a physically different 1681 * struct (a copy) if the blocks array gets reallocated. 1682 */ 1683 blockNumber++; 1684 blocksAppender.put(InputBlock(filename, blockNumber)); 1685 dataAppender = appender(&(blocksAppender.data[$-1].data)); 1686 dataAppender.reserve(blockSize); 1687 1688 /* Copy data following the newline from the last block to the new 1689 * block. Then append the current read buffer. 1690 */ 1691 immutable size_t moveRegionStart = searchStart + lastNewlineOffset; 1692 dataAppender.put(blocksAppender.data[$-2].data[moveRegionStart .. $]); 1693 dataAppender.put(cast(char[]) buffer); 1694 1695 /* Now delete the moved region from the last block. */ 1696 blocksAppender.data[$-2].data.length = moveRegionStart; 1697 1698 assert(blocksAppender.data.length >= 2); 1699 assert(blocksAppender.data[$-2].data[$-1] == '\n'); 1700 } 1701 else 1702 { 1703 /* Give up. Allow the current block to grow. */ 1704 dataAppender.put(cast(char[]) buffer); 1705 } 1706 } 1707 } 1708 } 1709 } 1710 1711 /** HasRandomValue is a boolean flag used at compile time by identifyInputLines to 1712 * distinguish use cases needing random value assignments from those that don't. 1713 */ 1714 alias HasRandomValue = Flag!"hasRandomValue"; 1715 1716 /** An InputLine array is returned by identifyInputLines to represent each non-header line 1717 * line found in a FileData array. The 'data' element contains the line. A 'randomValue' 1718 * line is included if random values are being generated. 1719 */ 1720 static struct InputLine(HasRandomValue hasRandomValue) 1721 { 1722 const(char)[] data; 1723 static if (hasRandomValue) double randomValue; 1724 } 1725 1726 /** identifyInputLines is used by algorithms that read all files into memory prior to 1727 * processing. It does the initial processing of the file data. 1728 * 1729 * Two main tasks are performed. One is splitting all input data into lines. The second 1730 * is assigning a random value to the line, if random values are being generated. 1731 * 1732 * The key input is an InputBlock array. Normally one block for each file, but standard 1733 * input may have multiple blocks. 1734 * 1735 * The return value is an array of InputLine structs. The struct will have a 'randomValue' 1736 * member if random values are being assigned. 1737 */ 1738 InputLine!hasRandomValue[] identifyInputLines(HasRandomValue hasRandomValue, Flag!"isWeighted" isWeighted) 1739 (const ref InputBlock[] inputBlocks, ref TsvSampleOptions cmdopt) 1740 { 1741 import std.algorithm : splitter; 1742 import std.array : appender; 1743 import std.random : Random = Mt19937, uniform01; 1744 import tsv_utils.common.utils : throwIfWindowsNewlineOnUnix; 1745 1746 static assert(hasRandomValue || !isWeighted); 1747 static if(!hasRandomValue) assert(!cmdopt.printRandom); 1748 1749 InputLine!hasRandomValue[] inputLines; 1750 1751 auto linesAppender = appender(&inputLines); 1752 static if (hasRandomValue) auto randomGenerator = Random(cmdopt.seed); 1753 1754 /* Note: fileLineNum is zero-based here. One-based in most other code in this file. */ 1755 immutable size_t fileBodyStartLine = cmdopt.hasHeader ? 1 : 0; 1756 size_t fileLineNum = fileBodyStartLine; 1757 1758 foreach (block; inputBlocks) 1759 { 1760 /* Drop the last newline to avoid adding an extra empty line. */ 1761 const data = (block.data.length > 0 && block.data[$-1] == '\n') ? 1762 block.data[0 .. $-1] : block.data; 1763 1764 if (block.fileBlockNumber == 0) fileLineNum = fileBodyStartLine; 1765 1766 foreach (ref line; data.splitter('\n')) 1767 { 1768 fileLineNum++; 1769 1770 if (fileLineNum == 1) throwIfWindowsNewlineOnUnix(line, block.filename, fileLineNum); 1771 1772 static if (!hasRandomValue) 1773 { 1774 linesAppender.put(InputLine!hasRandomValue(line)); 1775 } 1776 else 1777 { 1778 static if (!isWeighted) 1779 { 1780 immutable double randomValue = uniform01(randomGenerator); 1781 } 1782 else 1783 { 1784 immutable double lineWeight = 1785 getFieldValue!double(line, cmdopt.weightField, cmdopt.delim, 1786 block.filename, fileLineNum); 1787 immutable double randomValue = 1788 (lineWeight > 0.0) 1789 ? uniform01(randomGenerator) ^^ (1.0 / lineWeight) 1790 : 0.0; 1791 } 1792 1793 linesAppender.put(InputLine!hasRandomValue(line, randomValue)); 1794 } 1795 } 1796 } 1797 1798 return inputLines; 1799 } 1800 1801 1802 /* Unit tests for ReadFileData. These tests focus on multiple InputBlock scenarios. 1803 * Other use paths are well tested by the tests at the end cases. 1804 */ 1805 unittest 1806 { 1807 import tsv_utils.common.unittest_utils; 1808 import std.algorithm : equal, find, joiner, splitter; 1809 import std.array : appender; 1810 import std.file : rmdirRecurse; 1811 import std.path : buildPath; 1812 import std.range : repeat; 1813 1814 auto rfdTestDir = makeUnittestTempDir("tsv_sample_readFileData"); 1815 scope(exit) rfdTestDir.rmdirRecurse; 1816 1817 char[] file1Data; 1818 char[] file2Data; 1819 char[] file3Data; 1820 1821 auto app1 = appender(&file1Data); 1822 auto app2 = appender(&file2Data); 1823 auto app3 = appender(&file3Data); 1824 1825 /* File 1: 1000 short lines. */ 1826 app1.put("\n".repeat(100).joiner); 1827 app1.put("x\n".repeat(100).joiner); 1828 app1.put("yz\n".repeat(100).joiner); 1829 app1.put("pqr\n".repeat(100).joiner); 1830 app1.put("a\nbc\ndef\n".repeat(100).joiner); 1831 app1.put('\n'.repeat(100)); 1832 app1.put("z\n".repeat(100).joiner); 1833 app1.put("xy\n".repeat(100).joiner); 1834 1835 /* File 2: 500 longer lines. */ 1836 app2.put( 1837 "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" 1838 .repeat(100) 1839 .joiner); 1840 app2.put( 1841 "|abcdefghijklmnopqrstuv|\n|0123456789|\n|0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ|\n|abcdefghijklmnopqrstuvwxyz|\n" 1842 .repeat(100) 1843 .joiner); 1844 app2.put( 1845 "0123456789-abcdefghijklmnopqrstuvwxyz-0123456789abcdefghijklmnopqrstuvwxyz-0123456789ABCDEFGHIJKLMNOPQRSTUVWXYZ-\n" 1846 .repeat(100) 1847 .joiner); 1848 1849 /* File 3: 1000 mixed length lines. */ 1850 app3.put("\n\n|abcde|\n1\n12\n123\n|abcdefghijklmnop|\n|xyz|\n0123456789\nX\n".repeat(100).joiner); 1851 1852 string file1Path = buildPath(rfdTestDir, "file1.txt"); 1853 string file2Path = buildPath(rfdTestDir, "file2.txt"); 1854 string file3Path = buildPath(rfdTestDir, "file3.txt"); 1855 1856 try 1857 { 1858 auto ofile1 = File(file1Path, "w"); 1859 ofile1.write(file1Data); 1860 } 1861 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file1Path, e.msg)); 1862 1863 try 1864 { 1865 auto ofile2 = File(file2Path, "w"); 1866 ofile2.write(file2Data); 1867 } 1868 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file2Path, e.msg)); 1869 1870 try 1871 { 1872 auto ofile3 = File(file3Path, "w"); 1873 ofile3.write(file3Data); 1874 } 1875 catch (Exception e) assert(false, format("Failed to write file: %s.\n Error: %s", file3Path, e.msg)); 1876 1877 auto allData = file1Data ~ file2Data ~ file3Data; 1878 auto expectedLines = allData.splitter('\n').array[0 .. $-1]; 1879 1880 auto file2DataNoHeader = (file2Data.find('\n'))[1 .. $]; 1881 auto file3DataNoHeader = (file3Data.find('\n'))[1 .. $]; 1882 auto allDataUsingHeader = file1Data ~ file2DataNoHeader ~ file3DataNoHeader; 1883 auto expectedLinesUsingHeader = allDataUsingHeader.splitter('\n').array[0 .. $-1]; 1884 1885 assert(expectedLines.length == expectedLinesUsingHeader.length + 2); 1886 1887 TsvSampleOptions cmdoptNoHeader; 1888 auto noHeaderCmdArgs = ["unittest", file1Path]; 1889 auto r1 = cmdoptNoHeader.processArgs(noHeaderCmdArgs); 1890 assert(r1[0], format("Invalid command lines arg: '%s'.", noHeaderCmdArgs)); 1891 1892 TsvSampleOptions cmdoptYesHeader; 1893 auto yesHeaderCmdArgs = ["unittest", "--header", file1Path]; 1894 auto r2 = cmdoptYesHeader.processArgs(yesHeaderCmdArgs); 1895 assert(r2[0], format("Invalid command lines arg: '%s'.", yesHeaderCmdArgs)); 1896 1897 auto outputStream = appender!(char[])(); 1898 1899 { 1900 /* Reading as single blocks. */ 1901 ubyte[] rawReadBuffer = new ubyte[256]; 1902 InputBlock[] blocks; 1903 auto blocksAppender = appender(&blocks); 1904 blocksAppender.reserve(3); 1905 foreach (f; [ file1Path, file2Path, file3Path ]) 1906 { 1907 auto ifile = f.File; 1908 ulong filesize = ifile.size; 1909 if (filesize == ulong.max) filesize = 1000; 1910 readFileDataAsOneBlock(f, ifile, filesize, blocksAppender, rawReadBuffer); 1911 } 1912 auto inputLines = 1913 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 1914 blocks, cmdoptNoHeader); 1915 1916 assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); 1917 } 1918 1919 { 1920 /* Reading as multiple blocks. */ 1921 foreach (size_t searchSize; [ 0, 1, 2, 64 ]) 1922 { 1923 foreach (size_t blockSize; [ 1, 2, 16, 64, 256 ]) 1924 { 1925 foreach (size_t readSize; [ 1, 2, 8, 32 ]) 1926 { 1927 ubyte[] rawReadBuffer = new ubyte[readSize]; 1928 InputBlock[] blocks; 1929 auto blocksAppender = appender(&blocks); 1930 blocksAppender.reserve(3); 1931 foreach (f; [ file1Path, file2Path, file3Path ]) 1932 { 1933 auto ifile = f.File; 1934 readFileDataAsMultipleBlocks(f, ifile, blocksAppender, 1935 rawReadBuffer, blockSize, searchSize); 1936 } 1937 auto inputLines = 1938 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 1939 blocks, cmdoptNoHeader); 1940 1941 assert(equal!((a, b) => a.data == b)(inputLines, expectedLines)); 1942 } 1943 } 1944 } 1945 } 1946 version(none) { 1947 { 1948 /* Reading as multiple blocks, with header processing. */ 1949 const size_t readSize = 32; 1950 const size_t blockSize = 48; 1951 const size_t searchSize = 16; 1952 1953 ubyte[] rawReadBuffer = new ubyte[readSize]; 1954 InputBlock[] blocks; 1955 auto blocksAppender = appender(&blocks); 1956 blocksAppender.reserve(3); 1957 foreach (f; [ file1Path, file2Path, file3Path ]) 1958 { 1959 auto ifile = f.File; 1960 readFileDataAsMultipleBlocks(f, ifile, blocksAppender, 1961 rawReadBuffer, blockSize, searchSize); 1962 } 1963 auto inputLines = 1964 identifyInputLines!(No.hasRandomValue, No.isWeighted)( 1965 blocks, cmdoptYesHeader); 1966 1967 assert(outputStream.data == expectedLinesUsingHeader[0] ~ '\n'); 1968 assert(equal!((a, b) => a.data == b)(inputLines, expectedLinesUsingHeader[1 .. $])); 1969 } 1970 } 1971 } 1972 1973 /** Write a floating point random value to an output stream. 1974 * 1975 * This routine is used for floating point random value printing. This routine writes 1976 * 17 significant digits, the range available in doubles. This routine prefers decimal 1977 * format, without exponents. It will generate somewhat large precision numbers, 1978 * currently up to 28 digits, before switching to exponents. 1979 * 1980 * The primary reason for this approach is to enable faster sorting on random values 1981 * by GNU sort and similar external sorting programs. GNU sort is dramatically faster 1982 * on decimal format numeric sorts ('n' switch) than general numeric sorts ('g' switch). 1983 * The 'general numeric' handles exponential notation. The difference is 5-10x. 1984 * 1985 * Random values generated by Bernoulli sampling are nearly always greater than 1e-12. 1986 * No examples less than 1e-09 were seen in hundred of millions of trials. Similar 1987 * results were seen with weighted sampling with integer weights. The same is not true 1988 * with floating point weights. These produce quite large exponents. However, even 1989 * for floating point weights this can be useful. For random weights [0,1] less than 5% 1990 * will be less than 1e-12 and use exponential notation. 1991 */ 1992 void formatRandomValue(OutputRange)(auto ref OutputRange outputStream, double value) 1993 if (isOutputRange!(OutputRange, char)) 1994 { 1995 import std.format : formatValue, singleSpec; 1996 1997 immutable spec17f = singleSpec("%.17f"); 1998 immutable spec18f = singleSpec("%.18f"); 1999 immutable spec19f = singleSpec("%.19f"); 2000 immutable spec20f = singleSpec("%.20f"); 2001 immutable spec21f = singleSpec("%.21f"); 2002 immutable spec22f = singleSpec("%.22f"); 2003 immutable spec23f = singleSpec("%.23f"); 2004 immutable spec24f = singleSpec("%.24f"); 2005 immutable spec25f = singleSpec("%.25f"); 2006 immutable spec26f = singleSpec("%.26f"); 2007 immutable spec27f = singleSpec("%.27f"); 2008 immutable spec28f = singleSpec("%.28f"); 2009 2010 immutable spec17g = singleSpec("%.17g"); 2011 2012 immutable formatSpec = 2013 (value >= 1e-01) ? spec17f : 2014 (value >= 1e-02) ? spec18f : 2015 (value >= 1e-03) ? spec19f : 2016 (value >= 1e-04) ? spec20f : 2017 (value >= 1e-05) ? spec21f : 2018 (value >= 1e-06) ? spec22f : 2019 (value >= 1e-07) ? spec23f : 2020 (value >= 1e-08) ? spec24f : 2021 (value >= 1e-09) ? spec25f : 2022 (value >= 1e-10) ? spec26f : 2023 (value >= 1e-11) ? spec27f : 2024 (value >= 1e-12) ? spec28f : spec17g; 2025 2026 outputStream.formatValue(value, formatSpec); 2027 } 2028 2029 @safe unittest 2030 { 2031 void testFormatValue(double value, string expected) 2032 { 2033 import std.array : appender; 2034 2035 auto s = appender!string(); 2036 s.formatRandomValue(value); 2037 assert(s.data == expected, 2038 format("[testFormatValue] value: %g; expected: %s; actual: %s", value, expected, s.data)); 2039 } 2040 2041 testFormatValue(1.0, "1.00000000000000000"); 2042 testFormatValue(0.1, "0.10000000000000001"); 2043 testFormatValue(0.01, "0.010000000000000000"); 2044 testFormatValue(1e-03, "0.0010000000000000000"); 2045 testFormatValue(1e-04, "0.00010000000000000000"); 2046 testFormatValue(1e-05, "0.000010000000000000001"); 2047 testFormatValue(1e-06, "0.0000010000000000000000"); 2048 testFormatValue(1e-07, "0.00000010000000000000000"); 2049 testFormatValue(1e-08, "0.000000010000000000000000"); 2050 testFormatValue(1e-09, "0.0000000010000000000000001"); 2051 testFormatValue(1e-10, "0.00000000010000000000000000"); 2052 testFormatValue(1e-11, "0.000000000009999999999999999"); 2053 testFormatValue(1e-12, "0.0000000000010000000000000000"); 2054 testFormatValue(1e-13, "1e-13"); 2055 testFormatValue(1e-14, "1e-14"); 2056 testFormatValue(12345678901234567e-15, "12.34567890123456735"); 2057 testFormatValue(12345678901234567e-16, "1.23456789012345669"); 2058 testFormatValue(12345678901234567e-17, "0.12345678901234566"); 2059 testFormatValue(12345678901234567e-18, "0.012345678901234567"); 2060 testFormatValue(12345678901234567e-19, "0.0012345678901234567"); 2061 testFormatValue(12345678901234567e-20, "0.00012345678901234567"); 2062 testFormatValue(12345678901234567e-21, "0.000012345678901234568"); 2063 testFormatValue(12345678901234567e-22, "0.0000012345678901234567"); 2064 testFormatValue(12345678901234567e-23, "0.00000012345678901234566"); 2065 testFormatValue(12345678901234567e-24, "0.000000012345678901234567"); 2066 testFormatValue(12345678901234567e-25, "0.0000000012345678901234566"); 2067 testFormatValue(12345678901234567e-26, "0.00000000012345678901234568"); 2068 testFormatValue(12345678901234567e-27, "0.000000000012345678901234567"); 2069 testFormatValue(12345678901234567e-28, "0.0000000000012345678901234567"); 2070 testFormatValue(12345678901234567e-29, "1.2345678901234566e-13"); 2071 } 2072 2073 2074 /** Convenience function for extracting a single field from a line. See 2075 * [tsv_utils.common.utils.getTsvFieldValue] for details. This wrapper creates error 2076 * text tailored for this program. 2077 */ 2078 import std.traits : isSomeChar; 2079 T getFieldValue(T, C)(const C[] line, size_t fieldIndex, C delim, string filename, ulong lineNum) pure @safe 2080 if (isSomeChar!C) 2081 { 2082 import std.conv : ConvException, to; 2083 import tsv_utils.common.utils : getTsvFieldValue; 2084 2085 T val; 2086 try 2087 { 2088 val = getTsvFieldValue!T(line, fieldIndex, delim); 2089 } 2090 catch (ConvException exc) 2091 { 2092 throw new Exception( 2093 format("Could not process line: %s\n File: %s Line: %s%s", 2094 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum, 2095 (lineNum == 1) ? "\n Is this a header line? Use --H|header to skip." : "")); 2096 } 2097 catch (Exception exc) 2098 { 2099 /* Not enough fields on the line. */ 2100 throw new Exception( 2101 format("Could not process line: %s\n File: %s Line: %s", 2102 exc.msg, (filename == "-") ? "Standard Input" : filename, lineNum)); 2103 } 2104 2105 return val; 2106 } 2107 2108 @safe unittest 2109 { 2110 /* getFieldValue unit tests. getTsvFieldValue has it's own tests. 2111 * These tests make basic sanity checks on the getFieldValue wrapper. 2112 */ 2113 import std.exception; 2114 2115 assert(getFieldValue!double("123", 0, '\t', "unittest", 1) == 123); 2116 assert(getFieldValue!double("123.4", 0, '\t', "unittest", 1) == 123.4); 2117 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 1)); 2118 assertThrown(getFieldValue!double("abc", 0, '\t', "unittest", 2)); 2119 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 1)); 2120 assertThrown(getFieldValue!double("123", 1, '\t', "unittest", 2)); 2121 } 2122 2123 /* Unit tests for the main program start here. 2124 * 2125 * Portability note: Many of the tests here rely on generating consistent random numbers 2126 * across different platforms when using the same random seed. So far this has succeeded 2127 * on several different platform, compiler, and library versions. However, it is certainly 2128 * possible this condition will not hold on other platforms. 2129 * 2130 * For tsv-sample, this portability implies generating the same results on different 2131 * platforms when using the same random seed. This is NOT part of tsv-sample guarantees, 2132 * but it is convenient for testing. If platforms are identified that do not generate 2133 * the same results these tests will need to be adjusted. 2134 */ 2135 version(unittest) 2136 { 2137 /* Unit test helper functions. */ 2138 2139 import tsv_utils.common.unittest_utils; // tsv unit test helpers, from common/src/. 2140 import std.conv : to; 2141 2142 void testTsvSample(string[] cmdArgs, string[][] expected) 2143 { 2144 import std.array : appender; 2145 2146 assert(cmdArgs.length > 0, "[testTsvSample] cmdArgs must not be empty."); 2147 2148 auto formatAssertMessage(T...)(string msg, T formatArgs) 2149 { 2150 auto formatString = "[testTsvSample] %s: " ~ msg; 2151 return format(formatString, cmdArgs[0], formatArgs); 2152 } 2153 2154 TsvSampleOptions cmdopt; 2155 auto savedCmdArgs = cmdArgs.to!string; 2156 auto r = cmdopt.processArgs(cmdArgs); 2157 assert(r[0], formatAssertMessage("Invalid command lines arg: '%s'.", savedCmdArgs)); 2158 auto output = appender!(char[])(); 2159 2160 tsvSample(cmdopt, output); // This invokes the main code line. 2161 2162 auto expectedOutput = expected.tsvDataToString; 2163 2164 assert(output.data == expectedOutput, 2165 formatAssertMessage( 2166 "Result != expected:\n=====Expected=====\n%s=====Actual=======\n%s==================", 2167 expectedOutput.to!string, output.data.to!string)); 2168 } 2169 } 2170 2171 unittest 2172 { 2173 import std.path : buildPath; 2174 import std.file : rmdirRecurse; 2175 2176 auto testDir = makeUnittestTempDir("tsv_sample"); 2177 scope(exit) testDir.rmdirRecurse; 2178 2179 /* Tabular data sets and expected results use the built-in static seed. 2180 * Tests are run by writing the data set to a file, then calling the main 2181 * routine to process. The function testTsvSample plays the role of the 2182 * main program. Rather than writing to expected output, the results are 2183 * matched against expected. The expected results were verified by hand 2184 * prior to inclusion in the test. 2185 * 2186 * The initial part of this section is simply setting up data files and 2187 * expected results. 2188 * 2189 * Expected results naming conventions: 2190 * - Prefix: dataNxMExpected. N and M are numbers. e.g. data3x6Expected 2191 * - Sampling Type (required): Permute (Shuffle), Sample, Replace, Bernoulli, Distinct 2192 * - Compatibility: Compat, AlgoR, Skip, Swap, Inorder 2193 * - Weight Field: Wt<num>, e.g. Wt3 2194 * - Sample Size: Num<num>, eg. Num3 2195 * - Seed Value: V<num>, eg. V77 2196 * - Key Field: K<num>, e.g. K2 2197 * - Probability: P<num>, e.g P05 (5%) 2198 * - Printing Probabilities: Probs 2199 * - Printing Probs in order: ProbsInorder 2200 * - Printing Probs with custom header: RVCustom 2201 */ 2202 2203 /* Empty file. */ 2204 string[][] dataEmpty = []; 2205 string fpath_dataEmpty = buildPath(testDir, "dataEmpty.tsv"); 2206 writeUnittestTsvFile(fpath_dataEmpty, dataEmpty); 2207 2208 /* 3x1, header only. */ 2209 string[][] data3x0 = [["field_a", "field_b", "field_c"]]; 2210 string fpath_data3x0 = buildPath(testDir, "data3x0.tsv"); 2211 writeUnittestTsvFile(fpath_data3x0, data3x0); 2212 2213 /* 3x1 */ 2214 string[][] data3x1 = 2215 [["field_a", "field_b", "field_c"], 2216 ["tan", "タン", "8.5"]]; 2217 2218 string fpath_data3x1 = buildPath(testDir, "data3x1.tsv"); 2219 string fpath_data3x1_noheader = buildPath(testDir, "data3x1_noheader.tsv"); 2220 writeUnittestTsvFile(fpath_data3x1, data3x1); 2221 writeUnittestTsvFile(fpath_data3x1_noheader, data3x1[1 .. $]); 2222 2223 string[][] data3x1ExpectedReplaceNum3 = 2224 [["field_a", "field_b", "field_c"], 2225 ["tan", "タン", "8.5"], 2226 ["tan", "タン", "8.5"], 2227 ["tan", "タン", "8.5"]]; 2228 2229 /* 3x2 */ 2230 string[][] data3x2 = 2231 [["field_a", "field_b", "field_c"], 2232 ["brown", "褐色", "29.2"], 2233 ["gray", "グレー", "6.2"]]; 2234 2235 string fpath_data3x2 = buildPath(testDir, "data3x2.tsv"); 2236 string fpath_data3x2_noheader = buildPath(testDir, "data3x2_noheader.tsv"); 2237 writeUnittestTsvFile(fpath_data3x2, data3x2); 2238 writeUnittestTsvFile(fpath_data3x2_noheader, data3x2[1 .. $]); 2239 2240 string[][] data3x2PermuteCompat = 2241 [["field_a", "field_b", "field_c"], 2242 ["gray", "グレー", "6.2"], 2243 ["brown", "褐色", "29.2"]]; 2244 2245 string[][] data3x2PermuteShuffle = 2246 [["field_a", "field_b", "field_c"], 2247 ["gray", "グレー", "6.2"], 2248 ["brown", "褐色", "29.2"]]; 2249 2250 /* 3x3 */ 2251 string[][] data3x3 = 2252 [["field_a", "field_b", "field_c"], 2253 ["orange", "オレンジ", "2.5"], 2254 ["pink", "ピンク", "1.1"], 2255 ["purple", "紫の", "42"]]; 2256 2257 string fpath_data3x3 = buildPath(testDir, "data3x3.tsv"); 2258 string fpath_data3x3_noheader = buildPath(testDir, "data3x3_noheader.tsv"); 2259 writeUnittestTsvFile(fpath_data3x3, data3x3); 2260 writeUnittestTsvFile(fpath_data3x3_noheader, data3x3[1 .. $]); 2261 2262 string[][] data3x3ExpectedPermuteCompat = 2263 [["field_a", "field_b", "field_c"], 2264 ["purple", "紫の", "42"], 2265 ["pink", "ピンク", "1.1"], 2266 ["orange", "オレンジ", "2.5"]]; 2267 2268 string[][] data3x3ExpectedPermuteSwap = 2269 [["field_a", "field_b", "field_c"], 2270 ["purple", "紫の", "42"], 2271 ["orange", "オレンジ", "2.5"], 2272 ["pink", "ピンク", "1.1"]]; 2273 2274 /* 3x6 */ 2275 string[][] data3x6 = 2276 [["field_a", "field_b", "field_c"], 2277 ["red", "赤", "23.8"], 2278 ["green", "緑", "0.0072"], 2279 ["white", "白", "1.65"], 2280 ["yellow", "黄", "12"], 2281 ["blue", "青", "12"], 2282 ["black", "黒", "0.983"]]; 2283 string fpath_data3x6 = buildPath(testDir, "data3x6.tsv"); 2284 string fpath_data3x6_noheader = buildPath(testDir, "data3x6_noheader.tsv"); 2285 writeUnittestTsvFile(fpath_data3x6, data3x6); 2286 writeUnittestTsvFile(fpath_data3x6_noheader, data3x6[1 .. $]); 2287 2288 // Randomization, all lines 2289 string[][] data3x6ExpectedPermuteCompat = 2290 [["field_a", "field_b", "field_c"], 2291 ["yellow", "黄", "12"], 2292 ["black", "黒", "0.983"], 2293 ["blue", "青", "12"], 2294 ["white", "白", "1.65"], 2295 ["green", "緑", "0.0072"], 2296 ["red", "赤", "23.8"]]; 2297 2298 string[][] data3x6ExpectedPermuteSwap = 2299 [["field_a", "field_b", "field_c"], 2300 ["black", "黒", "0.983"], 2301 ["green", "緑", "0.0072"], 2302 ["red", "赤", "23.8"], 2303 ["yellow", "黄", "12"], 2304 ["white", "白", "1.65"], 2305 ["blue", "青", "12"]]; 2306 2307 string[][] data3x6ExpectedPermuteCompatProbs = 2308 [["random_value", "field_a", "field_b", "field_c"], 2309 ["0.96055546286515892", "yellow", "黄", "12"], 2310 ["0.75710153928957880", "black", "黒", "0.983"], 2311 ["0.52525980887003243", "blue", "青", "12"], 2312 ["0.49287854949943721", "white", "白", "1.65"], 2313 ["0.15929344086907804", "green", "緑", "0.0072"], 2314 ["0.010968807619065046", "red", "赤", "23.8"]]; 2315 2316 /* Note: data3x6ExpectedSampleAlgoRNum6 is identical to data3x6ExpectedPermuteSwap because 2317 * both are effectively the same algorithm given that --num is data length. Both read 2318 * in the full data in order then call randomShuffle. 2319 */ 2320 string[][] data3x6ExpectedSampleAlgoRNum6 = 2321 [["field_a", "field_b", "field_c"], 2322 ["black", "黒", "0.983"], 2323 ["green", "緑", "0.0072"], 2324 ["red", "赤", "23.8"], 2325 ["yellow", "黄", "12"], 2326 ["white", "白", "1.65"], 2327 ["blue", "青", "12"]]; 2328 2329 string[][] data3x6ExpectedSampleAlgoRNum5 = 2330 [["field_a", "field_b", "field_c"], 2331 ["red", "赤", "23.8"], 2332 ["black", "黒", "0.983"], 2333 ["white", "白", "1.65"], 2334 ["green", "緑", "0.0072"], 2335 ["yellow", "黄", "12"]]; 2336 2337 string[][] data3x6ExpectedSampleAlgoRNum4 = 2338 [["field_a", "field_b", "field_c"], 2339 ["blue", "青", "12"], 2340 ["green", "緑", "0.0072"], 2341 ["black", "黒", "0.983"], 2342 ["white", "白", "1.65"]]; 2343 2344 string[][] data3x6ExpectedSampleAlgoRNum3 = 2345 [["field_a", "field_b", "field_c"], 2346 ["red", "赤", "23.8"], 2347 ["black", "黒", "0.983"], 2348 ["green", "緑", "0.0072"]]; 2349 2350 string[][] data3x6ExpectedSampleAlgoRNum2 = 2351 [["field_a", "field_b", "field_c"], 2352 ["black", "黒", "0.983"], 2353 ["red", "赤", "23.8"]]; 2354 2355 string[][] data3x6ExpectedSampleAlgoRNum1 = 2356 [["field_a", "field_b", "field_c"], 2357 ["green", "緑", "0.0072"]]; 2358 2359 /* Inorder versions. */ 2360 string[][] data3x6ExpectedSampleAlgoRNum6Inorder = 2361 [["field_a", "field_b", "field_c"], 2362 ["red", "赤", "23.8"], 2363 ["green", "緑", "0.0072"], 2364 ["white", "白", "1.65"], 2365 ["yellow", "黄", "12"], 2366 ["blue", "青", "12"], 2367 ["black", "黒", "0.983"]]; 2368 2369 string[][] data3x6ExpectedSampleAlgoRNum5Inorder = 2370 [["field_a", "field_b", "field_c"], 2371 ["red", "赤", "23.8"], 2372 ["green", "緑", "0.0072"], 2373 ["white", "白", "1.65"], 2374 ["yellow", "黄", "12"], 2375 ["black", "黒", "0.983"]]; 2376 2377 string[][] data3x6ExpectedSampleAlgoRNum4Inorder = 2378 [["field_a", "field_b", "field_c"], 2379 ["green", "緑", "0.0072"], 2380 ["white", "白", "1.65"], 2381 ["blue", "青", "12"], 2382 ["black", "黒", "0.983"]]; 2383 2384 string[][] data3x6ExpectedSampleAlgoRNum3Inorder = 2385 [["field_a", "field_b", "field_c"], 2386 ["red", "赤", "23.8"], 2387 ["green", "緑", "0.0072"], 2388 ["black", "黒", "0.983"]]; 2389 2390 string[][] data3x6ExpectedSampleAlgoRNum2Inorder = 2391 [["field_a", "field_b", "field_c"], 2392 ["red", "赤", "23.8"], 2393 ["black", "黒", "0.983"]]; 2394 2395 string[][] data3x6ExpectedSampleAlgoRNum1Inorder = 2396 [["field_a", "field_b", "field_c"], 2397 ["green", "緑", "0.0072"]]; 2398 2399 /* Reservoir inorder */ 2400 string[][] data3x6ExpectedSampleCompatNum6Inorder = 2401 [["field_a", "field_b", "field_c"], 2402 ["red", "赤", "23.8"], 2403 ["green", "緑", "0.0072"], 2404 ["white", "白", "1.65"], 2405 ["yellow", "黄", "12"], 2406 ["blue", "青", "12"], 2407 ["black", "黒", "0.983"]]; 2408 2409 string[][] data3x6ExpectedSampleCompatNum5Inorder = 2410 [["field_a", "field_b", "field_c"], 2411 ["green", "緑", "0.0072"], 2412 ["white", "白", "1.65"], 2413 ["yellow", "黄", "12"], 2414 ["blue", "青", "12"], 2415 ["black", "黒", "0.983"]]; 2416 2417 string[][] data3x6ExpectedSampleCompatNum4Inorder = 2418 [["field_a", "field_b", "field_c"], 2419 ["white", "白", "1.65"], 2420 ["yellow", "黄", "12"], 2421 ["blue", "青", "12"], 2422 ["black", "黒", "0.983"]]; 2423 2424 string[][] data3x6ExpectedSampleCompatNum3Inorder = 2425 [["field_a", "field_b", "field_c"], 2426 ["yellow", "黄", "12"], 2427 ["blue", "青", "12"], 2428 ["black", "黒", "0.983"]]; 2429 2430 string[][] data3x6ExpectedSampleCompatNum2Inorder = 2431 [["field_a", "field_b", "field_c"], 2432 ["yellow", "黄", "12"], 2433 ["black", "黒", "0.983"]]; 2434 2435 string[][] data3x6ExpectedSampleCompatNum1Inorder = 2436 [["field_a", "field_b", "field_c"], 2437 ["yellow", "黄", "12"]]; 2438 2439 2440 /* Reservoir inorder with probabilities. */ 2441 string[][] data3x6ExpectedSampleCompatNum6ProbsInorder = 2442 [["random_value", "field_a", "field_b", "field_c"], 2443 ["0.010968807619065046", "red", "赤", "23.8"], 2444 ["0.15929344086907804", "green", "緑", "0.0072"], 2445 ["0.49287854949943721", "white", "白", "1.65"], 2446 ["0.96055546286515892", "yellow", "黄", "12"], 2447 ["0.52525980887003243", "blue", "青", "12"], 2448 ["0.75710153928957880", "black", "黒", "0.983"]]; 2449 2450 string[][] data3x6ExpectedSampleCompatNum5ProbsInorder = 2451 [["random_value", "field_a", "field_b", "field_c"], 2452 ["0.15929344086907804", "green", "緑", "0.0072"], 2453 ["0.49287854949943721", "white", "白", "1.65"], 2454 ["0.96055546286515892", "yellow", "黄", "12"], 2455 ["0.52525980887003243", "blue", "青", "12"], 2456 ["0.75710153928957880", "black", "黒", "0.983"]]; 2457 2458 string[][] data3x6ExpectedSampleCompatNum4ProbsInorder = 2459 [["random_value", "field_a", "field_b", "field_c"], 2460 ["0.49287854949943721", "white", "白", "1.65"], 2461 ["0.96055546286515892", "yellow", "黄", "12"], 2462 ["0.52525980887003243", "blue", "青", "12"], 2463 ["0.75710153928957880", "black", "黒", "0.983"]]; 2464 2465 string[][] data3x6ExpectedSampleCompatNum3ProbsInorder = 2466 [["random_value", "field_a", "field_b", "field_c"], 2467 ["0.96055546286515892", "yellow", "黄", "12"], 2468 ["0.52525980887003243", "blue", "青", "12"], 2469 ["0.75710153928957880", "black", "黒", "0.983"]]; 2470 2471 string[][] data3x6ExpectedSampleCompatNum2ProbsInorder = 2472 [["random_value", "field_a", "field_b", "field_c"], 2473 ["0.96055546286515892", "yellow", "黄", "12"], 2474 ["0.75710153928957880", "black", "黒", "0.983"]]; 2475 2476 string[][] data3x6ExpectedSampleCompatNum1ProbsInorder = 2477 [["random_value", "field_a", "field_b", "field_c"], 2478 ["0.96055546286515892", "yellow", "黄", "12"]]; 2479 2480 string[][] data3x6ExpectedWt3Num6Inorder = 2481 [["field_a", "field_b", "field_c"], 2482 ["red", "赤", "23.8"], 2483 ["green", "緑", "0.0072"], 2484 ["white", "白", "1.65"], 2485 ["yellow", "黄", "12"], 2486 ["blue", "青", "12"], 2487 ["black", "黒", "0.983"]]; 2488 2489 string[][] data3x6ExpectedWt3Num5Inorder = 2490 [["field_a", "field_b", "field_c"], 2491 ["green", "緑", "0.0072"], 2492 ["white", "白", "1.65"], 2493 ["yellow", "黄", "12"], 2494 ["blue", "青", "12"], 2495 ["black", "黒", "0.983"]]; 2496 2497 string[][] data3x6ExpectedWt3Num4Inorder = 2498 [["field_a", "field_b", "field_c"], 2499 ["white", "白", "1.65"], 2500 ["yellow", "黄", "12"], 2501 ["blue", "青", "12"], 2502 ["black", "黒", "0.983"]]; 2503 2504 string[][] data3x6ExpectedWt3Num3Inorder = 2505 [["field_a", "field_b", "field_c"], 2506 ["yellow", "黄", "12"], 2507 ["blue", "青", "12"], 2508 ["black", "黒", "0.983"]]; 2509 2510 string[][] data3x6ExpectedWt3Num2Inorder = 2511 [["field_a", "field_b", "field_c"], 2512 ["yellow", "黄", "12"], 2513 ["black", "黒", "0.983"]]; 2514 2515 string[][] data3x6ExpectedWt3Num1Inorder = 2516 [["field_a", "field_b", "field_c"], 2517 ["yellow", "黄", "12"]]; 2518 2519 2520 string[][] data3x6ExpectedBernoulliProbsP100 = 2521 [["random_value", "field_a", "field_b", "field_c"], 2522 ["0.010968807619065046", "red", "赤", "23.8"], 2523 ["0.15929344086907804", "green", "緑", "0.0072"], 2524 ["0.49287854949943721", "white", "白", "1.65"], 2525 ["0.96055546286515892", "yellow", "黄", "12"], 2526 ["0.52525980887003243", "blue", "青", "12"], 2527 ["0.75710153928957880", "black", "黒", "0.983"]]; 2528 2529 string[][] data3x6ExpectedBernoulliCompatProbsP60 = 2530 [["random_value", "field_a", "field_b", "field_c"], 2531 ["0.010968807619065046", "red", "赤", "23.8"], 2532 ["0.15929344086907804", "green", "緑", "0.0072"], 2533 ["0.49287854949943721", "white", "白", "1.65"], 2534 ["0.52525980887003243", "blue", "青", "12"]]; 2535 2536 string[][] data3x6ExpectedBernoulliSkipP40 = 2537 [["field_a", "field_b", "field_c"], 2538 ["red", "赤", "23.8"], 2539 ["green", "緑", "0.0072"], 2540 ["yellow", "黄", "12"]]; 2541 2542 string[][] data3x6ExpectedBernoulliCompatP60 = 2543 [["field_a", "field_b", "field_c"], 2544 ["red", "赤", "23.8"], 2545 ["green", "緑", "0.0072"], 2546 ["white", "白", "1.65"], 2547 ["blue", "青", "12"]]; 2548 2549 string[][] data3x6ExpectedDistinctK1K3P60 = 2550 [["field_a", "field_b", "field_c"], 2551 ["green", "緑", "0.0072"], 2552 ["white", "白", "1.65"], 2553 ["blue", "青", "12"]]; 2554 2555 string[][] data3x6ExpectedDistinctK1K3P60Probs = 2556 [["random_value", "field_a", "field_b", "field_c"], 2557 ["0", "green", "緑", "0.0072"], 2558 ["0", "white", "白", "1.65"], 2559 ["0", "blue", "青", "12"]]; 2560 2561 string[][] data3x6ExpectedDistinctK1K3P60ProbsRVCustom = 2562 [["custom_random_value_header", "field_a", "field_b", "field_c"], 2563 ["0", "green", "緑", "0.0072"], 2564 ["0", "white", "白", "1.65"], 2565 ["0", "blue", "青", "12"]]; 2566 2567 string[][] data3x6ExpectedDistinctK2P2ProbsInorder = 2568 [["random_value", "field_a", "field_b", "field_c"], 2569 ["1", "red", "赤", "23.8"], 2570 ["0", "green", "緑", "0.0072"], 2571 ["0", "white", "白", "1.65"], 2572 ["1", "yellow", "黄", "12"], 2573 ["3", "blue", "青", "12"], 2574 ["2", "black", "黒", "0.983"]]; 2575 2576 string[][] data3x6ExpectedPermuteWt3Probs = 2577 [["random_value", "field_a", "field_b", "field_c"], 2578 ["0.99665198757645390", "yellow", "黄", "12"], 2579 ["0.94775884809836686", "blue", "青", "12"], 2580 ["0.82728234682286661", "red", "赤", "23.8"], 2581 ["0.75346697377181959", "black", "黒", "0.983"], 2582 ["0.65130103496422487", "white", "白", "1.65"], 2583 ["1.5636943712879866e-111", "green", "緑", "0.0072"]]; 2584 2585 string[][] data3x6ExpectedWt3ProbsInorder = 2586 [["random_value", "field_a", "field_b", "field_c"], 2587 ["0.82728234682286661", "red", "赤", "23.8"], 2588 ["1.5636943712879866e-111", "green", "緑", "0.0072"], 2589 ["0.65130103496422487", "white", "白", "1.65"], 2590 ["0.99665198757645390", "yellow", "黄", "12"], 2591 ["0.94775884809836686", "blue", "青", "12"], 2592 ["0.75346697377181959", "black", "黒", "0.983"]]; 2593 2594 string[][] data3x6ExpectedPermuteWt3 = 2595 [["field_a", "field_b", "field_c"], 2596 ["yellow", "黄", "12"], 2597 ["blue", "青", "12"], 2598 ["red", "赤", "23.8"], 2599 ["black", "黒", "0.983"], 2600 ["white", "白", "1.65"], 2601 ["green", "緑", "0.0072"]]; 2602 2603 2604 string[][] data3x6ExpectedReplaceNum10 = 2605 [["field_a", "field_b", "field_c"], 2606 ["black", "黒", "0.983"], 2607 ["green", "緑", "0.0072"], 2608 ["green", "緑", "0.0072"], 2609 ["red", "赤", "23.8"], 2610 ["yellow", "黄", "12"], 2611 ["red", "赤", "23.8"], 2612 ["white", "白", "1.65"], 2613 ["yellow", "黄", "12"], 2614 ["yellow", "黄", "12"], 2615 ["white", "白", "1.65"], 2616 ]; 2617 2618 string[][] data3x6ExpectedReplaceNum10V77 = 2619 [["field_a", "field_b", "field_c"], 2620 ["black", "黒", "0.983"], 2621 ["red", "赤", "23.8"], 2622 ["black", "黒", "0.983"], 2623 ["yellow", "黄", "12"], 2624 ["green", "緑", "0.0072"], 2625 ["green", "緑", "0.0072"], 2626 ["green", "緑", "0.0072"], 2627 ["yellow", "黄", "12"], 2628 ["blue", "青", "12"], 2629 ["white", "白", "1.65"], 2630 ]; 2631 2632 /* Using a different static seed. */ 2633 string[][] data3x6ExpectedPermuteCompatV41Probs = 2634 [["random_value", "field_a", "field_b", "field_c"], 2635 ["0.68057272653095424", "green", "緑", "0.0072"], 2636 ["0.67681624367833138", "blue", "青", "12"], 2637 ["0.32097338931635022", "yellow", "黄", "12"], 2638 ["0.25092361867427826", "red", "赤", "23.8"], 2639 ["0.15535934292711318", "black", "黒", "0.983"], 2640 ["0.046095821075141430", "white", "白", "1.65"]]; 2641 2642 string[][] data3x6ExpectedBernoulliCompatP60V41Probs = 2643 [["random_value", "field_a", "field_b", "field_c"], 2644 ["0.25092361867427826", "red", "赤", "23.8"], 2645 ["0.046095821075141430", "white", "白", "1.65"], 2646 ["0.32097338931635022", "yellow", "黄", "12"], 2647 ["0.15535934292711318", "black", "黒", "0.983"]]; 2648 2649 string[][] data3x6ExpectedPermuteWt3V41Probs = 2650 [["random_value", "field_a", "field_b", "field_c"], 2651 ["0.96799377498910666", "blue", "青", "12"], 2652 ["0.94356245792573568", "red", "赤", "23.8"], 2653 ["0.90964601024271996", "yellow", "黄", "12"], 2654 ["0.15491658409260103", "white", "白", "1.65"], 2655 ["0.15043620392537033", "black", "黒", "0.983"], 2656 ["6.1394674830701461e-24", "green", "緑", "0.0072"]]; 2657 2658 string[][] data3x6ExpectedWt3V41ProbsInorder = 2659 [["random_value", "field_a", "field_b", "field_c"], 2660 ["0.94356245792573568", "red", "赤", "23.8"], 2661 ["6.1394674830701461e-24", "green", "緑", "0.0072"], 2662 ["0.15491658409260103", "white", "白", "1.65"], 2663 ["0.90964601024271996", "yellow", "黄", "12"], 2664 ["0.96799377498910666", "blue", "青", "12"], 2665 ["0.15043620392537033", "black", "黒", "0.983"]]; 2666 2667 2668 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */ 2669 string[][] combo1ExpectedPermuteCompat = 2670 [["field_a", "field_b", "field_c"], 2671 ["yellow", "黄", "12"], 2672 ["tan", "タン", "8.5"], 2673 ["brown", "褐色", "29.2"], 2674 ["green", "緑", "0.0072"], 2675 ["red", "赤", "23.8"], 2676 ["purple", "紫の", "42"], 2677 ["black", "黒", "0.983"], 2678 ["white", "白", "1.65"], 2679 ["gray", "グレー", "6.2"], 2680 ["blue", "青", "12"], 2681 ["pink", "ピンク", "1.1"], 2682 ["orange", "オレンジ", "2.5"]]; 2683 2684 string[][] combo1ExpectedPermuteCompatProbs = 2685 [["random_value", "field_a", "field_b", "field_c"], 2686 ["0.97088520275428891", "yellow", "黄", "12"], 2687 ["0.96055546286515892", "tan", "タン", "8.5"], 2688 ["0.81756894313730299", "brown", "褐色", "29.2"], 2689 ["0.75710153928957880", "green", "緑", "0.0072"], 2690 ["0.52525980887003243", "red", "赤", "23.8"], 2691 ["0.49287854949943721", "purple", "紫の", "42"], 2692 ["0.47081507067196071", "black", "黒", "0.983"], 2693 ["0.38388182921335101", "white", "白", "1.65"], 2694 ["0.29215990612283349", "gray", "グレー", "6.2"], 2695 ["0.24033216014504433", "blue", "青", "12"], 2696 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2697 ["0.010968807619065046", "orange", "オレンジ", "2.5"]]; 2698 2699 /* Combo 1: 3x3, 3x1, 3x6, 3x2. No data files, only expected results. */ 2700 string[][] combo1ExpectedProbsInorder = 2701 [["random_value", "field_a", "field_b", "field_c"], 2702 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2703 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2704 ["0.49287854949943721", "purple", "紫の", "42"], 2705 ["0.96055546286515892", "tan", "タン", "8.5"], 2706 ["0.52525980887003243", "red", "赤", "23.8"], 2707 ["0.75710153928957880", "green", "緑", "0.0072"], 2708 ["0.38388182921335101", "white", "白", "1.65"], 2709 ["0.97088520275428891", "yellow", "黄", "12"], 2710 ["0.24033216014504433", "blue", "青", "12"], 2711 ["0.47081507067196071", "black", "黒", "0.983"], 2712 ["0.81756894313730299", "brown", "褐色", "29.2"], 2713 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2714 2715 string[][] combo1ExpectedBernoulliCompatP50Probs = 2716 [["random_value", "field_a", "field_b", "field_c"], 2717 ["0.010968807619065046", "orange", "オレンジ", "2.5"], 2718 ["0.15929344086907804", "pink", "ピンク", "1.1"], 2719 ["0.49287854949943721", "purple", "紫の", "42"], 2720 ["0.38388182921335101", "white", "白", "1.65"], 2721 ["0.24033216014504433", "blue", "青", "12"], 2722 ["0.47081507067196071", "black", "黒", "0.983"], 2723 ["0.29215990612283349", "gray", "グレー", "6.2"]]; 2724 2725 string[][] combo1ExpectedBernoulliCompatP40 = 2726 [["field_a", "field_b", "field_c"], 2727 ["orange", "オレンジ", "2.5"], 2728 ["pink", "ピンク", "1.1"], 2729 ["white", "白", "1.65"], 2730 ["blue", "青", "12"], 2731 ["gray", "グレー", "6.2"]]; 2732 2733 string[][] combo1ExpectedDistinctK1P40 = 2734 [["field_a", "field_b", "field_c"], 2735 ["orange", "オレンジ", "2.5"], 2736 ["red", "赤", "23.8"], 2737 ["green", "緑", "0.0072"], 2738 ["blue", "青", "12"], 2739 ["black", "黒", "0.983"]]; 2740 2741 string[][] combo1ExpectedPermuteWt3Probs = 2742 [["random_value", "field_a", "field_b", "field_c"], 2743 ["0.99754077523718754", "yellow", "黄", "12"], 2744 ["0.99527665440088786", "tan", "タン", "8.5"], 2745 ["0.99312578945741659", "brown", "褐色", "29.2"], 2746 ["0.98329602553389361", "purple", "紫の", "42"], 2747 ["0.97330961938083660", "red", "赤", "23.8"], 2748 ["0.88797551521739648", "blue", "青", "12"], 2749 ["0.81999230489041786", "gray", "グレー", "6.2"], 2750 ["0.55975569204250941", "white", "白", "1.65"], 2751 ["0.46472135609205739", "black", "黒", "0.983"], 2752 ["0.18824582704191337", "pink", "ピンク", "1.1"], 2753 ["0.16446131853299920", "orange", "オレンジ", "2.5"], 2754 ["1.6438086931020549e-17", "green", "緑", "0.0072"]]; 2755 2756 string[][] combo1ExpectedPermuteWt3 = 2757 [["field_a", "field_b", "field_c"], 2758 ["yellow", "黄", "12"], 2759 ["tan", "タン", "8.5"], 2760 ["brown", "褐色", "29.2"], 2761 ["purple", "紫の", "42"], 2762 ["red", "赤", "23.8"], 2763 ["blue", "青", "12"], 2764 ["gray", "グレー", "6.2"], 2765 ["white", "白", "1.65"], 2766 ["black", "黒", "0.983"], 2767 ["pink", "ピンク", "1.1"], 2768 ["orange", "オレンジ", "2.5"], 2769 ["green", "緑", "0.0072"]]; 2770 2771 string[][] combo1ExpectedSampleAlgoRNum4 = 2772 [["field_a", "field_b", "field_c"], 2773 ["blue", "青", "12"], 2774 ["gray", "グレー", "6.2"], 2775 ["brown", "褐色", "29.2"], 2776 ["white", "白", "1.65"]]; 2777 2778 string[][] combo1ExpectedSampleAlgoRNum4Inorder = 2779 [["field_a", "field_b", "field_c"], 2780 ["white", "白", "1.65"], 2781 ["blue", "青", "12"], 2782 ["brown", "褐色", "29.2"], 2783 ["gray", "グレー", "6.2"]]; 2784 2785 string[][] combo1ExpectedReplaceNum10 = 2786 [["field_a", "field_b", "field_c"], 2787 ["gray", "グレー", "6.2"], 2788 ["yellow", "黄", "12"], 2789 ["yellow", "黄", "12"], 2790 ["white", "白", "1.65"], 2791 ["tan", "タン", "8.5"], 2792 ["white", "白", "1.65"], 2793 ["blue", "青", "12"], 2794 ["black", "黒", "0.983"], 2795 ["tan", "タン", "8.5"], 2796 ["purple", "紫の", "42"]]; 2797 2798 /* 1x200 - Needed for testing bernoulliSkipSampling, invoked with prob < 0.04. */ 2799 string[][] data1x200 = 2800 [["field_a"], 2801 ["000"], ["001"], ["002"], ["003"], ["004"], ["005"], ["006"], ["007"], ["008"], ["009"], 2802 ["010"], ["011"], ["012"], ["013"], ["014"], ["015"], ["016"], ["017"], ["018"], ["019"], 2803 ["020"], ["021"], ["022"], ["023"], ["024"], ["025"], ["026"], ["027"], ["028"], ["029"], 2804 ["030"], ["031"], ["032"], ["033"], ["034"], ["035"], ["036"], ["037"], ["038"], ["039"], 2805 ["040"], ["041"], ["042"], ["043"], ["044"], ["045"], ["046"], ["047"], ["048"], ["049"], 2806 ["050"], ["051"], ["052"], ["053"], ["054"], ["055"], ["056"], ["057"], ["058"], ["059"], 2807 ["060"], ["061"], ["062"], ["063"], ["064"], ["065"], ["066"], ["067"], ["068"], ["069"], 2808 ["070"], ["071"], ["072"], ["073"], ["074"], ["075"], ["076"], ["077"], ["078"], ["079"], 2809 ["080"], ["081"], ["082"], ["083"], ["084"], ["085"], ["086"], ["087"], ["088"], ["089"], 2810 ["090"], ["091"], ["092"], ["093"], ["094"], ["095"], ["096"], ["097"], ["098"], ["099"], 2811 ["100"], ["101"], ["102"], ["103"], ["104"], ["105"], ["106"], ["107"], ["108"], ["109"], 2812 ["110"], ["111"], ["112"], ["113"], ["114"], ["115"], ["116"], ["117"], ["118"], ["119"], 2813 ["120"], ["121"], ["122"], ["123"], ["124"], ["125"], ["126"], ["127"], ["128"], ["129"], 2814 ["130"], ["131"], ["132"], ["133"], ["134"], ["135"], ["136"], ["137"], ["138"], ["139"], 2815 ["140"], ["141"], ["142"], ["143"], ["144"], ["145"], ["146"], ["147"], ["148"], ["149"], 2816 ["150"], ["151"], ["152"], ["153"], ["154"], ["155"], ["156"], ["157"], ["158"], ["159"], 2817 ["160"], ["161"], ["162"], ["163"], ["164"], ["165"], ["166"], ["167"], ["168"], ["169"], 2818 ["170"], ["171"], ["172"], ["173"], ["174"], ["175"], ["176"], ["177"], ["178"], ["179"], 2819 ["180"], ["181"], ["182"], ["183"], ["184"], ["185"], ["186"], ["187"], ["188"], ["189"], 2820 ["190"], ["191"], ["192"], ["193"], ["194"], ["195"], ["196"], ["197"], ["198"], ["199"], 2821 ]; 2822 2823 string fpath_data1x200 = buildPath(testDir, "data1x200.tsv"); 2824 string fpath_data1x200_noheader = buildPath(testDir, "data1x200_noheader.tsv"); 2825 writeUnittestTsvFile(fpath_data1x200, data1x200); 2826 writeUnittestTsvFile(fpath_data1x200_noheader, data1x200[1 .. $]); 2827 2828 string[][] data1x200ExpectedBernoulliSkipV333P01 = 2829 [["field_a"], 2830 ["077"], 2831 ["119"]]; 2832 2833 string[][] data1x200ExpectedBernoulliSkipV333P02 = 2834 [["field_a"], 2835 ["038"], 2836 ["059"], 2837 ["124"], 2838 ["161"], 2839 ["162"], 2840 ["183"]]; 2841 2842 string[][] data1x200ExpectedBernoulliSkipV333P03 = 2843 [["field_a"], 2844 ["025"], 2845 ["039"], 2846 ["082"], 2847 ["107"], 2848 ["108"], 2849 ["122"], 2850 ["136"], 2851 ["166"], 2852 ["182"]]; 2853 2854 string[][] data1x200ExpectedBernoulliCompatV333P01 = 2855 [["field_a"], 2856 ["072"]]; 2857 2858 string[][] data1x200ExpectedBernoulliCompatV333P02 = 2859 [["field_a"], 2860 ["004"], 2861 ["072"]]; 2862 2863 string[][] data1x200ExpectedBernoulliCompatV333P03 = 2864 [["field_a"], 2865 ["004"], 2866 ["072"], 2867 ["181"]]; 2868 2869 /* Combo 2, for bernoulli skip sampling: 3x0, 3x1, 1x200, empty, 1x10. No data files, 2870 * only expected results. The header is from 3x0, the results are offset 1-position 2871 * from data1x200ExpectedBernoulliSkipV333P03 due to insertion of a single preceding line. 2872 */ 2873 string[][] combo2ExpectedBernoulliSkipV333P03 = 2874 [["field_a", "field_b", "field_c"], 2875 ["024"], 2876 ["038"], 2877 ["081"], 2878 ["106"], 2879 ["107"], 2880 ["121"], 2881 ["135"], 2882 ["165"], 2883 ["181"]]; 2884 2885 2886 /* 1x10 - Simple 1-column file. */ 2887 string[][] data1x10 = 2888 [["field_a"], ["1"], ["2"], ["3"], ["4"], ["5"], ["6"], ["7"], ["8"], ["9"], ["10"]]; 2889 string fpath_data1x10 = buildPath(testDir, "data1x10.tsv"); 2890 string fpath_data1x10_noheader = buildPath(testDir, "data1x10_noheader.tsv"); 2891 writeUnittestTsvFile(fpath_data1x10, data1x10); 2892 writeUnittestTsvFile(fpath_data1x10_noheader, data1x10[1 .. $]); 2893 2894 string[][] data1x10ExpectedPermuteCompat = 2895 [["field_a"], ["8"], ["4"], ["6"], ["5"], ["3"], ["10"], ["7"], ["9"], ["2"], ["1"]]; 2896 2897 string[][] data1x10ExpectedPermuteWt1 = 2898 [["field_a"], ["8"], ["4"], ["6"], ["10"], ["5"], ["7"], ["9"], ["3"], ["2"], ["1"]]; 2899 2900 /* 2x10a - Uniform distribution [0,1]. */ 2901 string[][] data2x10a = 2902 [["line", "weight"], 2903 ["1", "0.26788837"], 2904 ["2", "0.06601298"], 2905 ["3", "0.38627527"], 2906 ["4", "0.47379424"], 2907 ["5", "0.02966641"], 2908 ["6", "0.05636231"], 2909 ["7", "0.70529242"], 2910 ["8", "0.91836862"], 2911 ["9", "0.99103720"], 2912 ["10", "0.31401740"]]; 2913 2914 string fpath_data2x10a = buildPath(testDir, "data2x10a.tsv"); 2915 writeUnittestTsvFile(fpath_data2x10a, data2x10a); 2916 2917 string[][] data2x10aExpectedPermuteWt2Probs = 2918 [["random_value", "line", "weight"], 2919 ["0.96833865494543658", "8", "0.91836862"], 2920 ["0.91856842054413923", "4", "0.47379424"], 2921 ["0.25730832087795091", "7", "0.70529242"], 2922 ["0.23725317907018120", "9", "0.99103720"], 2923 ["0.16016096701872204", "3", "0.38627527"], 2924 ["0.090819662667243381", "10", "0.31401740"], 2925 ["0.0071764539244361172", "6", "0.05636231"], 2926 ["0.000000048318642951630057", "1", "0.26788837"], 2927 ["0.00000000037525692966535517", "5", "0.02966641"], 2928 ["8.2123247880095796e-13", "2", "0.06601298"]]; 2929 2930 /* 2x10b - Uniform distribution [0,1000]. */ 2931 string[][] data2x10b = 2932 [["line", "weight"], 2933 ["1", "761"], 2934 ["2", "432"], 2935 ["3", "103"], 2936 ["4", "448"], 2937 ["5", "750"], 2938 ["6", "711"], 2939 ["7", "867"], 2940 ["8", "841"], 2941 ["9", "963"], 2942 ["10", "784"]]; 2943 2944 string fpath_data2x10b = buildPath(testDir, "data2x10b.tsv"); 2945 writeUnittestTsvFile(fpath_data2x10b, data2x10b); 2946 2947 string[][] data2x10bExpectedPermuteWt2Probs = 2948 [["random_value", "line", "weight"], 2949 ["0.99996486739067969", "8", "841"], 2950 ["0.99991017467137211", "4", "448"], 2951 ["0.99960871524873662", "6", "711"], 2952 ["0.99914188537143800", "5", "750"], 2953 ["0.99903963250274785", "10", "784"], 2954 ["0.99889631825931946", "7", "867"], 2955 ["0.99852058315191139", "9", "963"], 2956 ["0.99575669679158918", "2", "432"], 2957 ["0.99408758732050595", "1", "761"], 2958 ["0.99315467761212362", "3", "103"]]; 2959 2960 /* 2x10c - Logarithmic distribution in random order. */ 2961 string[][] data2x10c = 2962 [["line", "weight"], 2963 ["1", "31.85"], 2964 ["2", "17403.31"], 2965 ["3", "653.84"], 2966 ["4", "8.23"], 2967 ["5", "2671.04"], 2968 ["6", "26226.08"], 2969 ["7", "1.79"], 2970 ["8", "354.56"], 2971 ["9", "35213.81"], 2972 ["10", "679.29"]]; 2973 2974 string fpath_data2x10c = buildPath(testDir, "data2x10c.tsv"); 2975 writeUnittestTsvFile(fpath_data2x10c, data2x10c); 2976 2977 string[][] data2x10cExpectedPermuteWt2Probs = 2978 [["random_value", "line", "weight"], 2979 ["0.99998939008709697", "6", "26226.08"], 2980 ["0.99995951291695517", "9", "35213.81"], 2981 ["0.99991666907613541", "8", "354.56"], 2982 ["0.99989445052186410", "2", "17403.31"], 2983 ["0.99975897602861630", "5", "2671.04"], 2984 ["0.99891852769877643", "3", "653.84"], 2985 ["0.99889167752782515", "10", "679.29"], 2986 ["0.99512207506850148", "4", "8.23"], 2987 ["0.86789371584259023", "1", "31.85"], 2988 ["0.58574438162915610", "7", "1.79"]]; 2989 2990 /* 2x10d. Logarithmic distribution in ascending order. */ 2991 string[][] data2x10d = 2992 [["line", "weight"], 2993 ["1", "1.79"], 2994 ["2", "8.23"], 2995 ["3", "31.85"], 2996 ["4", "354.56"], 2997 ["5", "653.84"], 2998 ["6", "679.29"], 2999 ["7", "2671.04"], 3000 ["8", "17403.31"], 3001 ["9", "26226.08"], 3002 ["10", "35213.81"]]; 3003 3004 string fpath_data2x10d = buildPath(testDir, "data2x10d.tsv"); 3005 writeUnittestTsvFile(fpath_data2x10d, data2x10d); 3006 3007 string[][] data2x10dExpectedPermuteWt2Probs = 3008 [["random_value", "line", "weight"], 3009 ["0.99999830221846353", "8", "17403.31"], 3010 ["0.99997860834041397", "10", "35213.81"], 3011 ["0.99994563828986716", "9", "26226.08"], 3012 ["0.99988650363575737", "4", "354.56"], 3013 ["0.99964161939190088", "7", "2671.04"], 3014 ["0.99959045338948649", "6", "679.29"], 3015 ["0.99901574490639788", "5", "653.84"], 3016 ["0.97803163304747431", "3", "31.85"], 3017 ["0.79994791806910948", "2", "8.23"], 3018 ["0.080374261239949119", "1", "1.79"]]; 3019 3020 /* 2x10e. Logarithmic distribution in descending order. */ 3021 string[][] data2x10e = 3022 [["line", "weight"], 3023 ["1", "35213.81"], 3024 ["2", "26226.08"], 3025 ["3", "17403.31"], 3026 ["4", "2671.04"], 3027 ["5", "679.29"], 3028 ["6", "653.84"], 3029 ["7", "354.56"], 3030 ["8", "31.85"], 3031 ["9", "8.23"], 3032 ["10", "1.79"]]; 3033 string fpath_data2x10e = buildPath(testDir, "data2x10e.tsv"); 3034 writeUnittestTsvFile(fpath_data2x10e, data2x10e); 3035 3036 string[][] data2x10eExpectedPermuteWt2Probs = 3037 [["random_value", "line", "weight"], 3038 ["0.99998493348975237", "4", "2671.04"], 3039 ["0.99995934807202624", "3", "17403.31"], 3040 ["0.99992995739727453", "2", "26226.08"], 3041 ["0.99987185679245649", "1", "35213.81"], 3042 ["0.99957451563173938", "6", "653.84"], 3043 ["0.99907273650209583", "8", "31.85"], 3044 ["0.99905260312968946", "5", "679.29"], 3045 ["0.99730333650516401", "7", "354.56"], 3046 ["0.84093902435227808", "9", "8.23"], 3047 ["0.65650015926290028", "10", "1.79"]]; 3048 3049 /* Data sets for distinct sampling. */ 3050 string[][] data5x25 = 3051 [["ID", "Shape", "Color", "Size", "Weight"], 3052 ["01", "circle", "red", "S", "10"], 3053 ["02", "circle", "black", "L", "20"], 3054 ["03", "square", "black", "L", "20"], 3055 ["04", "circle", "green", "L", "30"], 3056 ["05", "ellipse", "red", "S", "20"], 3057 ["06", "triangle", "red", "S", "10"], 3058 ["07", "triangle", "red", "L", "20"], 3059 ["08", "square", "black", "S", "10"], 3060 ["09", "circle", "black", "S", "20"], 3061 ["10", "square", "green", "L", "20"], 3062 ["11", "triangle", "red", "L", "20"], 3063 ["12", "circle", "green", "L", "30"], 3064 ["13", "ellipse", "red", "S", "20"], 3065 ["14", "circle", "green", "L", "30"], 3066 ["15", "ellipse", "red", "L", "30"], 3067 ["16", "square", "red", "S", "10"], 3068 ["17", "circle", "black", "L", "20"], 3069 ["18", "square", "red", "S", "20"], 3070 ["19", "square", "black", "L", "20"], 3071 ["20", "circle", "red", "S", "10"], 3072 ["21", "ellipse", "black", "L", "30"], 3073 ["22", "triangle", "red", "L", "30"], 3074 ["23", "circle", "green", "S", "20"], 3075 ["24", "square", "green", "L", "20"], 3076 ["25", "circle", "red", "S", "10"], 3077 ]; 3078 3079 string fpath_data5x25 = buildPath(testDir, "data5x25.tsv"); 3080 string fpath_data5x25_noheader = buildPath(testDir, "data5x25_noheader.tsv"); 3081 writeUnittestTsvFile(fpath_data5x25, data5x25); 3082 writeUnittestTsvFile(fpath_data5x25_noheader, data5x25[1 .. $]); 3083 3084 string[][] data5x25ExpectedDistinctK2P40 = 3085 [["ID", "Shape", "Color", "Size", "Weight"], 3086 ["03", "square", "black", "L", "20"], 3087 ["05", "ellipse", "red", "S", "20"], 3088 ["08", "square", "black", "S", "10"], 3089 ["10", "square", "green", "L", "20"], 3090 ["13", "ellipse", "red", "S", "20"], 3091 ["15", "ellipse", "red", "L", "30"], 3092 ["16", "square", "red", "S", "10"], 3093 ["18", "square", "red", "S", "20"], 3094 ["19", "square", "black", "L", "20"], 3095 ["21", "ellipse", "black", "L", "30"], 3096 ["24", "square", "green", "L", "20"], 3097 ]; 3098 3099 string[][] data5x25ExpectedDistinctK2K4P20 = 3100 [["ID", "Shape", "Color", "Size", "Weight"], 3101 ["03", "square", "black", "L", "20"], 3102 ["07", "triangle", "red", "L", "20"], 3103 ["08", "square", "black", "S", "10"], 3104 ["10", "square", "green", "L", "20"], 3105 ["11", "triangle", "red", "L", "20"], 3106 ["16", "square", "red", "S", "10"], 3107 ["18", "square", "red", "S", "20"], 3108 ["19", "square", "black", "L", "20"], 3109 ["22", "triangle", "red", "L", "30"], 3110 ["24", "square", "green", "L", "20"], 3111 ]; 3112 3113 string[][] data5x25ExpectedDistinctK2K3K4P20 = 3114 [["ID", "Shape", "Color", "Size", "Weight"], 3115 ["04", "circle", "green", "L", "30"], 3116 ["07", "triangle", "red", "L", "20"], 3117 ["09", "circle", "black", "S", "20"], 3118 ["11", "triangle", "red", "L", "20"], 3119 ["12", "circle", "green", "L", "30"], 3120 ["14", "circle", "green", "L", "30"], 3121 ["16", "square", "red", "S", "10"], 3122 ["18", "square", "red", "S", "20"], 3123 ["22", "triangle", "red", "L", "30"], 3124 ]; 3125 3126 /* Fields 2 and 4 from data5x25. Distinct rows should be the same for equiv keys. */ 3127 string[][] data2x25 = 3128 [["Shape", "Size"], 3129 ["circle", "S"], 3130 ["circle", "L"], 3131 ["square", "L"], 3132 ["circle", "L"], 3133 ["ellipse", "S"], 3134 ["triangle", "S"], 3135 ["triangle", "L"], 3136 ["square", "S"], 3137 ["circle", "S"], 3138 ["square", "L"], 3139 ["triangle", "L"], 3140 ["circle", "L"], 3141 ["ellipse", "S"], 3142 ["circle", "L"], 3143 ["ellipse", "L"], 3144 ["square", "S"], 3145 ["circle", "L"], 3146 ["square", "S"], 3147 ["square", "L"], 3148 ["circle", "S"], 3149 ["ellipse", "L"], 3150 ["triangle", "L"], 3151 ["circle", "S"], 3152 ["square", "L"], 3153 ["circle", "S"], 3154 ]; 3155 3156 string fpath_data2x25 = buildPath(testDir, "data2x25.tsv"); 3157 string fpath_data2x25_noheader = buildPath(testDir, "data2x25_noheader.tsv"); 3158 writeUnittestTsvFile(fpath_data2x25, data2x25); 3159 writeUnittestTsvFile(fpath_data2x25_noheader, data2x25[1 .. $]); 3160 3161 string[][] data2x25ExpectedDistinctK1K2P20 = 3162 [["Shape", "Size"], 3163 ["square", "L"], 3164 ["triangle", "L"], 3165 ["square", "S"], 3166 ["square", "L"], 3167 ["triangle", "L"], 3168 ["square", "S"], 3169 ["square", "S"], 3170 ["square", "L"], 3171 ["triangle", "L"], 3172 ["square", "L"], 3173 ]; 3174 3175 string[][] data1x25 = 3176 [["Shape-Size"], 3177 ["circle-S"], 3178 ["circle-L"], 3179 ["square-L"], 3180 ["circle-L"], 3181 ["ellipse-S"], 3182 ["triangle-S"], 3183 ["triangle-L"], 3184 ["square-S"], 3185 ["circle-S"], 3186 ["square-L"], 3187 ["triangle-L"], 3188 ["circle-L"], 3189 ["ellipse-S"], 3190 ["circle-L"], 3191 ["ellipse-L"], 3192 ["square-S"], 3193 ["circle-L"], 3194 ["square-S"], 3195 ["square-L"], 3196 ["circle-S"], 3197 ["ellipse-L"], 3198 ["triangle-L"], 3199 ["circle-S"], 3200 ["square-L"], 3201 ["circle-S"], 3202 ]; 3203 3204 string fpath_data1x25 = buildPath(testDir, "data1x25.tsv"); 3205 string fpath_data1x25_noheader = buildPath(testDir, "data1x25_noheader.tsv"); 3206 writeUnittestTsvFile(fpath_data1x25, data1x25); 3207 writeUnittestTsvFile(fpath_data1x25_noheader, data1x25[1 .. $]); 3208 3209 string[][] data1x25ExpectedDistinctK1P20 = 3210 [["Shape-Size"], 3211 ["triangle-L"], 3212 ["square-S"], 3213 ["triangle-L"], 3214 ["ellipse-L"], 3215 ["square-S"], 3216 ["square-S"], 3217 ["ellipse-L"], 3218 ["triangle-L"], 3219 ]; 3220 3221 string[][] data1x25ExpectedDistinctK1P20Probs = 3222 [["random_value", "Shape-Size"], 3223 ["0", "triangle-L"], 3224 ["0", "square-S"], 3225 ["0", "triangle-L"], 3226 ["0", "ellipse-L"], 3227 ["0", "square-S"], 3228 ["0", "square-S"], 3229 ["0", "ellipse-L"], 3230 ["0", "triangle-L"], 3231 ]; 3232 3233 string[][] data1x25ExpectedDistinctK1P20ProbsInorder = 3234 [["random_value", "Shape-Size"], 3235 ["1", "circle-S"], 3236 ["4", "circle-L"], 3237 ["2", "square-L"], 3238 ["4", "circle-L"], 3239 ["2", "ellipse-S"], 3240 ["1", "triangle-S"], 3241 ["0", "triangle-L"], 3242 ["0", "square-S"], 3243 ["1", "circle-S"], 3244 ["2", "square-L"], 3245 ["0", "triangle-L"], 3246 ["4", "circle-L"], 3247 ["2", "ellipse-S"], 3248 ["4", "circle-L"], 3249 ["0", "ellipse-L"], 3250 ["0", "square-S"], 3251 ["4", "circle-L"], 3252 ["0", "square-S"], 3253 ["2", "square-L"], 3254 ["1", "circle-S"], 3255 ["0", "ellipse-L"], 3256 ["0", "triangle-L"], 3257 ["1", "circle-S"], 3258 ["2", "square-L"], 3259 ["1", "circle-S"], 3260 ]; 3261 3262 /* 3263 * Enough setup! Actually run some tests! 3264 */ 3265 3266 /* Shuffling tests. Headers, static seed, compatibility mode. With weights and without. */ 3267 testTsvSample(["test-a1", "--header", "--static-seed", "--compatibility-mode", fpath_dataEmpty], dataEmpty); 3268 testTsvSample(["test-a2", "--header", "--static-seed", "--compatibility-mode", fpath_data3x0], data3x0); 3269 testTsvSample(["test-a3", "-H", "-s", "--compatibility-mode", fpath_data3x1], data3x1); 3270 testTsvSample(["test-a4", "-H", "-s", "--compatibility-mode", fpath_data3x2], data3x2PermuteCompat); 3271 testTsvSample(["test-a5", "-H", "-s", "--compatibility-mode", fpath_data3x3], data3x3ExpectedPermuteCompat); 3272 testTsvSample(["test-a6", "-H", "-s", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat); 3273 testTsvSample(["test-a7", "-H", "-s", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3274 testTsvSample(["test-a8", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3275 testTsvSample(["test-a9", "-H", "-s", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3276 testTsvSample(["test-a10", "-H", "--seed-value", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3277 testTsvSample(["test-a11", "-H", "-s", "-v", "41", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3278 testTsvSample(["test-a12", "-H", "-s", "-v", "0", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs); 3279 testTsvSample(["test-a13", "-H", "-v", "41", "-w", "3", "--print-random", fpath_data3x6], data3x6ExpectedPermuteWt3V41Probs); 3280 3281 /* Shuffling, without compatibility mode, or with both compatibility and printing. */ 3282 testTsvSample(["test-aa1", "--header", "--static-seed", fpath_dataEmpty], dataEmpty); 3283 testTsvSample(["test-aa2", "--header", "--static-seed", fpath_data3x0], data3x0); 3284 testTsvSample(["test-aa3", "-H", "-s", fpath_data3x1], data3x1); 3285 testTsvSample(["test-aa4", "-H", "-s", fpath_data3x2], data3x2PermuteShuffle); 3286 testTsvSample(["test-aa5", "-H", "-s", fpath_data3x3], data3x3ExpectedPermuteSwap); 3287 testTsvSample(["test-aa6", "-H", "-s", fpath_data3x6], data3x6ExpectedPermuteSwap); 3288 testTsvSample(["test-aa7", "-H", "-s", "--weight-field", "3", fpath_data3x6], data3x6ExpectedPermuteWt3); 3289 testTsvSample(["test-aa8", "-H", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteWt3Probs); 3290 testTsvSample(["test-aa9", "-H", "--seed-value", "41", "--print-random", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompatV41Probs); 3291 3292 /* Reservoir sampling using Algorithm R. 3293 * (Note: reservoirSamplingViaHeap is tested later in the length-based iteration loops.) 3294 */ 3295 testTsvSample(["test-aa10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3296 testTsvSample(["test-aa11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3297 testTsvSample(["test-aa12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x0], data3x0); 3298 testTsvSample(["test-aa13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x0], data3x0); 3299 testTsvSample(["test-aa14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x1], data3x1); 3300 testTsvSample(["test-aa15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x1], data3x1); 3301 testTsvSample(["test-aa16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3302 testTsvSample(["test-aa17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6); 3303 testTsvSample(["test-aa18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5); 3304 testTsvSample(["test-aa19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4); 3305 testTsvSample(["test-aa20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3); 3306 testTsvSample(["test-aa21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2); 3307 testTsvSample(["test-aa22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1); 3308 3309 /* Inorder versions of Algorithm R tests. */ 3310 testTsvSample(["test-ai10", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3311 testTsvSample(["test-ai11", "--prefer-algorithm-r", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3312 testTsvSample(["test-ai12", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3313 testTsvSample(["test-ai13", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3314 testTsvSample(["test-ai14", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3315 testTsvSample(["test-ai15", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3316 testTsvSample(["test-ai16", "--prefer-algorithm-r", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3317 testTsvSample(["test-ai17", "--prefer-algorithm-r", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum6Inorder); 3318 testTsvSample(["test-ai18", "--prefer-algorithm-r", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum5Inorder); 3319 testTsvSample(["test-ai19", "--prefer-algorithm-r", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum4Inorder); 3320 testTsvSample(["test-ai20", "--prefer-algorithm-r", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum3Inorder); 3321 testTsvSample(["test-ai21", "--prefer-algorithm-r", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum2Inorder); 3322 testTsvSample(["test-ai22", "--prefer-algorithm-r", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleAlgoRNum1Inorder); 3323 3324 /* Bernoulli sampling cases. */ 3325 testTsvSample(["test-a14", "--header", "--static-seed", "--prob", "0.001", fpath_dataEmpty], dataEmpty); 3326 testTsvSample(["test-a15", "--header", "--static-seed", "--prob", "0.001", fpath_data3x0], data3x0); 3327 testTsvSample(["test-a16", "-H", "-s", "-p", "1.0", fpath_data3x1], data3x1); 3328 testTsvSample(["test-a17", "-H", "-s", "-p", "1.0", fpath_data3x6], data3x6); 3329 testTsvSample(["test-a18", "-H", "-p", "1.0", fpath_data3x6], data3x6); 3330 testTsvSample(["test-a19", "-H", "-s", "--prob", "1.0", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3331 testTsvSample(["test-a20", "-H", "-s", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60); 3332 testTsvSample(["test-a21", "-H", "-s", "--prob", "0.60", fpath_data3x6], data3x6ExpectedBernoulliCompatP60); 3333 testTsvSample(["test-a22", "-H", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatP60V41Probs); 3334 3335 /* Bernoulli sampling with probabilities in skip sampling range or preferring skip sampling. */ 3336 testTsvSample(["test-ab1", "-H", "--seed-value", "333", "--prob", "0.01", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P01); 3337 testTsvSample(["test-ab2", "-H", "--seed-value", "333", "--prob", "0.02", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P02); 3338 testTsvSample(["test-ab3", "-H", "--seed-value", "333", "--prob", "0.03", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03); 3339 testTsvSample(["test-ab4", "-H", "--seed-value", "333", "--prob", "0.01", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P01); 3340 testTsvSample(["test-ab5", "-H", "--seed-value", "333", "--prob", "0.02", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P02); 3341 testTsvSample(["test-ab6", "-H", "--seed-value", "333", "--prob", "0.03", "--compatibility-mode", fpath_data1x200], data1x200ExpectedBernoulliCompatV333P03); 3342 testTsvSample(["test-ab7", "-H", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6], data3x6ExpectedBernoulliSkipP40); 3343 3344 /* Distinct sampling cases. */ 3345 testTsvSample(["test-a23", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_dataEmpty], dataEmpty); 3346 testTsvSample(["test-a24", "--header", "--static-seed", "--prob", "0.001", "--key-fields", "1", fpath_data3x0], data3x0); 3347 testTsvSample(["test-a25", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x1], data3x1); 3348 testTsvSample(["test-a26", "-H", "-s", "-p", "1.0", "-k", "2", fpath_data3x6], data3x6); 3349 testTsvSample(["test-a27", "-H", "-s", "-p", "0.6", "-k", "1,3", fpath_data3x6], data3x6ExpectedDistinctK1K3P60); 3350 3351 3352 /* Generating random weights. Use Bernoulli sampling test set at prob 100% for uniform sampling. 3353 * For weighted sampling, use the weighted cases, but with expected using the original ordering. 3354 */ 3355 testTsvSample(["test-a28", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3356 testTsvSample(["test-a29", "-H", "-s", "--gen-random-inorder", fpath_data3x6], data3x6ExpectedBernoulliProbsP100); 3357 testTsvSample(["test-a30", "-H", "-s", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3358 data3x6ExpectedWt3ProbsInorder); 3359 testTsvSample(["test-a31", "-H", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6], 3360 data3x6ExpectedWt3V41ProbsInorder); 3361 testTsvSample(["test-a32", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6], 3362 data3x6ExpectedDistinctK1K3P60Probs); 3363 testTsvSample(["test-a33", "-H", "-s", "-p", "0.6", "-k", "1,3", "--print-random", "--random-value-header", 3364 "custom_random_value_header", fpath_data3x6], data3x6ExpectedDistinctK1K3P60ProbsRVCustom); 3365 testTsvSample(["test-a34", "-H", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6], 3366 data3x6ExpectedDistinctK2P2ProbsInorder); 3367 3368 /* Simple random sampling with replacement. */ 3369 testTsvSample(["test-a35", "-H", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3370 testTsvSample(["test-a36", "-H", "-s", "--replace", "--num", "3", fpath_dataEmpty], dataEmpty); 3371 testTsvSample(["test-a37", "-H", "-s", "--replace", fpath_data3x0], data3x0); 3372 testTsvSample(["test-a38", "-H", "-s", "--replace", "--num", "3", fpath_data3x0], data3x0); 3373 testTsvSample(["test-a39", "-H", "-s", "--replace", "--num", "3", fpath_data3x1], data3x1ExpectedReplaceNum3); 3374 testTsvSample(["test-a40", "-H", "-s", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10); 3375 testTsvSample(["test-a41", "-H", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6], data3x6ExpectedReplaceNum10V77); 3376 3377 /* Shuffling, compatibility mode, without headers. */ 3378 testTsvSample(["test-b1", "-s", "--compatibility-mode", fpath_data3x1_noheader], data3x1[1 .. $]); 3379 testTsvSample(["test-b2", "-s", "--compatibility-mode", fpath_data3x2_noheader], data3x2PermuteCompat[1 .. $]); 3380 testTsvSample(["test-b3", "-s", "--compatibility-mode", fpath_data3x3_noheader], data3x3ExpectedPermuteCompat[1 .. $]); 3381 testTsvSample(["test-b4", "-s", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1 .. $]); 3382 testTsvSample(["test-b5", "-s", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1 .. $]); 3383 testTsvSample(["test-b6", "-s", "--weight-field", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3384 testTsvSample(["test-b7", "-s", "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3385 testTsvSample(["test-b8", "-v", "41", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3386 testTsvSample(["test-b9", "-v", "41", "-w", "3", "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3V41Probs[1 .. $]); 3387 3388 /* Shuffling, no headers, without compatibility mode, or with printing and compatibility mode. */ 3389 testTsvSample(["test-bb1", "-s", fpath_data3x1_noheader], data3x1[1 .. $]); 3390 testTsvSample(["test-bb2", "-s", fpath_data3x2_noheader], data3x2PermuteShuffle[1 .. $]); 3391 testTsvSample(["test-bb3", "-s", fpath_data3x3_noheader], data3x3ExpectedPermuteSwap[1 .. $]); 3392 testTsvSample(["test-bb4", "-s", fpath_data3x6_noheader], data3x6ExpectedPermuteSwap[1 .. $]); 3393 testTsvSample(["test-bb5", "-s", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1 .. $]); 3394 testTsvSample(["test-bb6", "-s", "--print-random", "-w", "3", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1 .. $]); 3395 testTsvSample(["test-bb7", "-v", "41", "--print-random", "--compatibility-mode", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatV41Probs[1 .. $]); 3396 3397 /* Reservoir sampling using Algorithm R, no headers. */ 3398 testTsvSample(["test-ac10", "--prefer-algorithm-r", "--static-seed", "--num", "1", fpath_dataEmpty], dataEmpty); 3399 testTsvSample(["test-ac11", "--prefer-algorithm-r", "--static-seed", "--num", "2", fpath_dataEmpty], dataEmpty); 3400 testTsvSample(["test-ac14", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x1_noheader], data3x1[1 .. $]); 3401 testTsvSample(["test-ac15", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3402 testTsvSample(["test-ac16", "--prefer-algorithm-r", "-s", "--num", "7", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3403 testTsvSample(["test-ac17", "--prefer-algorithm-r", "-s", "--num", "6", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6[1 .. $]); 3404 testTsvSample(["test-ac18", "--prefer-algorithm-r", "-s", "--num", "5", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5[1 .. $]); 3405 testTsvSample(["test-ac19", "--prefer-algorithm-r", "-s", "--num", "4", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4[1 .. $]); 3406 testTsvSample(["test-ac20", "--prefer-algorithm-r", "-s", "--num", "3", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3[1 .. $]); 3407 testTsvSample(["test-ac21", "--prefer-algorithm-r", "-s", "--num", "2", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2[1 .. $]); 3408 testTsvSample(["test-ac22", "--prefer-algorithm-r", "-s", "--num", "1", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1[1 .. $]); 3409 3410 /* Reservoir sampling using Algorithm R, no headers, inorder output. */ 3411 testTsvSample(["test-aj10", "--prefer-algorithm-r", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3412 testTsvSample(["test-aj11", "--prefer-algorithm-r", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3413 testTsvSample(["test-aj14", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3414 testTsvSample(["test-aj15", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3415 testTsvSample(["test-aj16", "--prefer-algorithm-r", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3416 testTsvSample(["test-aj17", "--prefer-algorithm-r", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum6Inorder[1 .. $]); 3417 testTsvSample(["test-aj18", "--prefer-algorithm-r", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum5Inorder[1 .. $]); 3418 testTsvSample(["test-aj19", "--prefer-algorithm-r", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3419 testTsvSample(["test-aj20", "--prefer-algorithm-r", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum3Inorder[1 .. $]); 3420 testTsvSample(["test-aj21", "--prefer-algorithm-r", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum2Inorder[1 .. $]); 3421 testTsvSample(["test-aj22", "--prefer-algorithm-r", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleAlgoRNum1Inorder[1 .. $]); 3422 3423 /* Bernoulli sampling cases. */ 3424 testTsvSample(["test-b10", "-s", "-p", "1.0", fpath_data3x1_noheader], data3x1[1 .. $]); 3425 testTsvSample(["test-b11", "-s", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3426 testTsvSample(["test-b12", "-p", "1.0", fpath_data3x6_noheader], data3x6[1 .. $]); 3427 testTsvSample(["test-b13", "-s", "--prob", "1.0", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3428 testTsvSample(["test-b14", "-s", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1 .. $]); 3429 testTsvSample(["test-b15", "-v", "41", "--prob", "0.60", "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60V41Probs[1 .. $]); 3430 3431 /* Bernoulli sampling with probabilities in skip sampling range. */ 3432 testTsvSample(["test-bb1", "-v", "333", "-p", "0.01", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P01[1 .. $]); 3433 testTsvSample(["test-bb2", "-v", "333", "-p", "0.02", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P02[1 .. $]); 3434 testTsvSample(["test-bb3", "-v", "333", "-p", "0.03", fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1 .. $]); 3435 testTsvSample(["test-bb4", "-v", "333", "-p", "0.01", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P01[1 .. $]); 3436 testTsvSample(["test-bb5", "-v", "333", "-p", "0.02", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P02[1 .. $]); 3437 testTsvSample(["test-bb6", "-v", "333", "-p", "0.03", "--compatibility-mode", fpath_data1x200_noheader], data1x200ExpectedBernoulliCompatV333P03[1 .. $]); 3438 testTsvSample(["test-bb7", "-s", "-p", "0.40", "--prefer-skip-sampling", fpath_data3x6_noheader], data3x6ExpectedBernoulliSkipP40[1 .. $]); 3439 3440 /* Distinct sampling cases. */ 3441 testTsvSample(["test-b16", "-s", "-p", "1.0", "-k", "2", fpath_data3x1_noheader], data3x1[1 .. $]); 3442 testTsvSample(["test-b17", "-s", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3443 testTsvSample(["test-b18", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3444 testTsvSample(["test-b19", "-v", "71563", "-p", "1.0", "-k", "2", fpath_data3x6_noheader], data3x6[1 .. $]); 3445 3446 /* Generating random weights. Reuse Bernoulli sampling tests at prob 100%. */ 3447 testTsvSample(["test-b20", "-s", "--gen-random-inorder", fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1 .. $]); 3448 testTsvSample(["test-b23", "-v", "41", "--gen-random-inorder", "--weight-field", "3", fpath_data3x6_noheader], data3x6ExpectedWt3V41ProbsInorder[1 .. $]); 3449 testTsvSample(["test-b24", "-s", "-p", "0.6", "-k", "1,3", "--print-random", fpath_data3x6_noheader], 3450 data3x6ExpectedDistinctK1K3P60Probs[1 .. $]); 3451 testTsvSample(["test-b24", "-s", "-p", "0.2", "-k", "2", "--gen-random-inorder", fpath_data3x6_noheader], 3452 data3x6ExpectedDistinctK2P2ProbsInorder[1 .. $]); 3453 3454 /* Simple random sampling with replacement. */ 3455 testTsvSample(["test-b25", "-s", "--replace", fpath_dataEmpty], dataEmpty); 3456 testTsvSample(["test-b26", "-s", "-r", "--num", "3", fpath_dataEmpty], dataEmpty); 3457 testTsvSample(["test-b27", "-s", "-r", "-n", "3", fpath_data3x1_noheader], data3x1ExpectedReplaceNum3[1 .. $]); 3458 testTsvSample(["test-b28", "-s", "--replace", "-n", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10[1 .. $]); 3459 testTsvSample(["test-b29", "-s", "-v", "77", "--replace", "--num", "10", fpath_data3x6_noheader], data3x6ExpectedReplaceNum10V77[1 .. $]); 3460 3461 /* Multi-file tests. */ 3462 testTsvSample(["test-c1", "--header", "--static-seed", "--compatibility-mode", 3463 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3464 combo1ExpectedPermuteCompat); 3465 testTsvSample(["test-c2", "--header", "--static-seed", "--print-random", 3466 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3467 combo1ExpectedPermuteCompatProbs); 3468 testTsvSample(["test-c3", "--header", "--static-seed", "--print-random", "--weight-field", "3", 3469 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3470 combo1ExpectedPermuteWt3Probs); 3471 testTsvSample(["test-c4", "--header", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3472 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3473 combo1ExpectedPermuteWt3); 3474 testTsvSample(["test-c5", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3475 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3476 combo1ExpectedSampleAlgoRNum4); 3477 testTsvSample(["test-c5b", "--header", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3478 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3479 combo1ExpectedSampleAlgoRNum4Inorder); 3480 3481 /* Multi-file, no headers. */ 3482 testTsvSample(["test-c6", "--static-seed", "--compatibility-mode", 3483 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3484 fpath_data3x6_noheader, fpath_data3x2_noheader], 3485 combo1ExpectedPermuteCompat[1 .. $]); 3486 testTsvSample(["test-c7", "--static-seed", "--print-random", 3487 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3488 fpath_data3x6_noheader, fpath_data3x2_noheader], 3489 combo1ExpectedPermuteCompatProbs[1 .. $]); 3490 testTsvSample(["test-c8", "--static-seed", "--print-random", "--weight-field", "3", 3491 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3492 fpath_data3x6_noheader, fpath_data3x2_noheader], 3493 combo1ExpectedPermuteWt3Probs[1 .. $]); 3494 testTsvSample(["test-c9", "--static-seed", "--weight-field", "3", "--compatibility-mode", 3495 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3496 fpath_data3x6_noheader, fpath_data3x2_noheader], 3497 combo1ExpectedPermuteWt3[1 .. $]); 3498 testTsvSample(["test-c10", "--static-seed", "--prefer-algorithm-r", "--num", "4", 3499 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3500 fpath_data3x6_noheader, fpath_data3x2_noheader], 3501 combo1ExpectedSampleAlgoRNum4[1 .. $]); 3502 testTsvSample(["test-c10b", "--static-seed", "--prefer-algorithm-r", "--num", "4", "--inorder", 3503 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3504 fpath_data3x6_noheader, fpath_data3x2_noheader], 3505 combo1ExpectedSampleAlgoRNum4Inorder[1 .. $]); 3506 3507 /* Bernoulli sampling cases. */ 3508 testTsvSample(["test-c11", "--header", "--static-seed", "--print-random", "--prob", ".5", 3509 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3510 combo1ExpectedBernoulliCompatP50Probs); 3511 testTsvSample(["test-c12", "--header", "--static-seed", "--prob", ".4", 3512 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3513 combo1ExpectedBernoulliCompatP40); 3514 testTsvSample(["test-c13", "--static-seed", "--print-random", "--prob", ".5", 3515 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3516 fpath_data3x6_noheader, fpath_data3x2_noheader], 3517 combo1ExpectedBernoulliCompatP50Probs[1 .. $]); 3518 testTsvSample(["test-c14", "--static-seed", "--prob", ".4", 3519 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3520 fpath_data3x6_noheader, fpath_data3x2_noheader], 3521 combo1ExpectedBernoulliCompatP40[1 .. $]); 3522 3523 /* Bernoulli sampling with probabilities in skip sampling range. */ 3524 testTsvSample(["test-cc1", "-H", "-v", "333", "-p", "0.03", 3525 fpath_data3x0, fpath_data3x1, fpath_data1x200, fpath_dataEmpty, fpath_data1x10], 3526 combo2ExpectedBernoulliSkipV333P03); 3527 testTsvSample(["test-cc1", "-v", "333", "-p", "0.03", 3528 fpath_data3x1_noheader, fpath_data1x200_noheader, fpath_dataEmpty, fpath_data1x10_noheader], 3529 combo2ExpectedBernoulliSkipV333P03[1 .. $]); 3530 3531 /* Distinct sampling cases. */ 3532 testTsvSample(["test-c13", "--header", "--static-seed", "--key-fields", "1", "--prob", ".4", 3533 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3534 combo1ExpectedDistinctK1P40); 3535 testTsvSample(["test-c14", "--static-seed", "--key-fields", "1", "--prob", ".4", 3536 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3537 fpath_data3x6_noheader, fpath_data3x2_noheader], 3538 combo1ExpectedDistinctK1P40[1 .. $]); 3539 3540 /* Generating random weights. */ 3541 testTsvSample(["test-c15", "--header", "--static-seed", "--gen-random-inorder", 3542 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3543 combo1ExpectedProbsInorder); 3544 testTsvSample(["test-c16", "--static-seed", "--gen-random-inorder", 3545 fpath_data3x3_noheader, fpath_data3x1_noheader, 3546 fpath_dataEmpty, fpath_data3x6_noheader, fpath_data3x2_noheader], 3547 combo1ExpectedProbsInorder[1 .. $]); 3548 3549 /* Simple random sampling with replacement. */ 3550 testTsvSample(["test-c17", "--header", "--static-seed", "--replace", "--num", "10", 3551 fpath_data3x0, fpath_data3x3, fpath_data3x1, fpath_dataEmpty, fpath_data3x6, fpath_data3x2], 3552 combo1ExpectedReplaceNum10); 3553 3554 testTsvSample(["test-c18", "--static-seed", "--replace", "--num", "10", 3555 fpath_data3x3_noheader, fpath_data3x1_noheader, fpath_dataEmpty, 3556 fpath_data3x6_noheader, fpath_data3x2_noheader], 3557 combo1ExpectedReplaceNum10[1 .. $]); 3558 3559 /* Single column file. */ 3560 testTsvSample(["test-d1", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3561 testTsvSample(["test-d2", "-H", "-s", "--compatibility-mode", fpath_data1x10], data1x10ExpectedPermuteCompat); 3562 3563 /* Distributions. */ 3564 testTsvSample(["test-e1", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10a], data2x10aExpectedPermuteWt2Probs); 3565 testTsvSample(["test-e2", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10b], data2x10bExpectedPermuteWt2Probs); 3566 testTsvSample(["test-e3", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10c], data2x10cExpectedPermuteWt2Probs); 3567 testTsvSample(["test-e4", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10d], data2x10dExpectedPermuteWt2Probs); 3568 testTsvSample(["test-e5", "-H", "-s", "-w", "2", "--print-random", fpath_data2x10e], data2x10eExpectedPermuteWt2Probs); 3569 3570 /* Tests of subset sample (--n|num) field. Random sampling, Bernoulli sampling, distinct sampling. 3571 * 3572 * Note: The way these tests are done ensures that subset length does not affect 3573 * output order. 3574 */ 3575 import std.algorithm : min; 3576 for (size_t n = data3x6.length + 2; n >= 1; n--) 3577 { 3578 /* reservoirSamplingViaHeap. 3579 */ 3580 size_t expectedLength = min(data3x6.length, n + 1); 3581 testTsvSample([format("test-f1_%d", n), "-s", "-n", n.to!string, 3582 "-H", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3583 3584 testTsvSample([format("test-f2_%d", n), "-s", "-n", n.to!string, 3585 "-H", "--compatibility-mode", fpath_data3x6], data3x6ExpectedPermuteCompat[0..expectedLength]); 3586 3587 testTsvSample([format("test-f3_%d", n), "-s", "-n", n.to!string, 3588 "-H", "--print-random", fpath_data3x6], data3x6ExpectedPermuteCompatProbs[0..expectedLength]); 3589 3590 testTsvSample([format("test-f4_%d", n), "-s", "-n", n.to!string, 3591 "-H", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3[0..expectedLength]); 3592 3593 testTsvSample([format("test-f5_%d", n), "-s", "-n", n.to!string, 3594 "-H", "--print-random", "-w", "3", fpath_data3x6], data3x6ExpectedPermuteWt3Probs[0..expectedLength]); 3595 3596 testTsvSample([format("test-f6_%d", n), "-s", "-n", n.to!string, 3597 fpath_data3x6_noheader], data3x6ExpectedPermuteCompat[1..expectedLength]); 3598 3599 testTsvSample([format("test-f7_%d", n), "-s", "-n", n.to!string, 3600 "--print-random", fpath_data3x6_noheader], data3x6ExpectedPermuteCompatProbs[1..expectedLength]); 3601 3602 testTsvSample([format("test-f8_%d", n), "-s", "-n", n.to!string, 3603 "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3[1..expectedLength]); 3604 3605 testTsvSample([format("test-f9_%d", n), "-s", "-n", n.to!string, 3606 "--print-random", "-w", "3", fpath_data3x6_noheader], data3x6ExpectedPermuteWt3Probs[1..expectedLength]); 3607 3608 /* Bernoulli sampling. 3609 */ 3610 import std.algorithm : min; 3611 size_t sampleExpectedLength = min(expectedLength, data3x6ExpectedBernoulliCompatProbsP60.length); 3612 3613 testTsvSample([format("test-f10_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3614 "-H", "--print-random", fpath_data3x6], data3x6ExpectedBernoulliCompatProbsP60[0..sampleExpectedLength]); 3615 3616 testTsvSample([format("test-f11_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3617 "-H", fpath_data3x6], data3x6ExpectedBernoulliCompatP60[0..sampleExpectedLength]); 3618 3619 testTsvSample([format("test-f12_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3620 "--print-random", fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatProbsP60[1..sampleExpectedLength]); 3621 3622 testTsvSample([format("test-f13_%d", n), "-s", "-p", "0.6", "-n", n.to!string, 3623 fpath_data3x6_noheader], data3x6ExpectedBernoulliCompatP60[1..sampleExpectedLength]); 3624 3625 /* Distinct Sampling. 3626 */ 3627 size_t distinctExpectedLength = min(expectedLength, data3x6ExpectedDistinctK1K3P60.length); 3628 3629 testTsvSample([format("test-f14_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 3630 "-H", fpath_data3x6], data3x6ExpectedDistinctK1K3P60[0..distinctExpectedLength]); 3631 3632 testTsvSample([format("test-f15_%d", n), "-s", "-k", "1,3", "-p", "0.6", "-n", n.to!string, 3633 fpath_data3x6_noheader], data3x6ExpectedDistinctK1K3P60[1..distinctExpectedLength]); 3634 3635 testTsvSample([format("test-f16_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 3636 "-H", fpath_data3x6], data3x6ExpectedBernoulliProbsP100[0..expectedLength]); 3637 3638 testTsvSample([format("test-f17_%d", n), "-s", "--gen-random-inorder", "-n", n.to!string, 3639 fpath_data3x6_noheader], data3x6ExpectedBernoulliProbsP100[1..expectedLength]); 3640 } 3641 3642 /* Similar tests with the 1x10 data set. */ 3643 for (size_t n = data1x10.length + 2; n >= 1; n--) 3644 { 3645 size_t expectedLength = min(data1x10.length, n + 1); 3646 testTsvSample([format("test-g1_%d", n), "-s", "-n", n.to!string, 3647 "-H", fpath_data1x10], data1x10ExpectedPermuteCompat[0..expectedLength]); 3648 3649 testTsvSample([format("test-g2_%d", n), "-s", "-n", n.to!string, 3650 "-H", "-w", "1", fpath_data1x10], data1x10ExpectedPermuteWt1[0..expectedLength]); 3651 3652 testTsvSample([format("test-g3_%d", n), "-s", "-n", n.to!string, 3653 fpath_data1x10_noheader], data1x10ExpectedPermuteCompat[1..expectedLength]); 3654 3655 testTsvSample([format("test-g4_%d", n), "-s", "-n", n.to!string, 3656 "-w", "1", fpath_data1x10_noheader], data1x10ExpectedPermuteWt1[1..expectedLength]); 3657 } 3658 3659 /* Simple random sampling with replacement: ensure sample size doesn't change order. */ 3660 for (size_t n = data3x6ExpectedReplaceNum10.length - 1; n >= 1; n--) 3661 { 3662 testTsvSample([format("test-h1_%d", n), "-s", "--replace", "-n", n.to!string, "-H", fpath_data3x6], 3663 data3x6ExpectedReplaceNum10[0 .. n + 1]); 3664 3665 testTsvSample([format("test-h2_%d", n), "-s", "--replace", "-n", n.to!string, fpath_data3x6_noheader], 3666 data3x6ExpectedReplaceNum10[1 .. n + 1]); 3667 } 3668 3669 /* Bernoulli skip sampling. Test with lengths both greater than and less than expected. */ 3670 for (size_t n = data1x200ExpectedBernoulliSkipV333P03.length + 2; n >= 1; n--) 3671 { 3672 size_t expectedLength = min(data1x200ExpectedBernoulliSkipV333P03.length, n + 1); 3673 3674 testTsvSample([format("test-i1_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3675 "-H", fpath_data1x200], data1x200ExpectedBernoulliSkipV333P03[0..expectedLength]); 3676 3677 testTsvSample([format("test-i2_%d", n), "-v", "333", "-p", "0.03", "-n", n.to!string, 3678 fpath_data1x200_noheader], data1x200ExpectedBernoulliSkipV333P03[1..expectedLength]); 3679 } 3680 3681 /* Inorder sampling tests using reservoir sampling via heap (compatibility mode). */ 3682 testTsvSample(["test-ar10", "--compatibility-mode", "--header", "--static-seed", "--num", "1", "--inorder", fpath_dataEmpty], dataEmpty); 3683 testTsvSample(["test-ar11", "--compatibility-mode", "--header", "--static-seed", "--num", "2", "--inorder", fpath_dataEmpty], dataEmpty); 3684 testTsvSample(["test-ar12", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x0], data3x0); 3685 testTsvSample(["test-ar13", "--compatibility-mode", "-H", "-s", "--num", "2", "--inorder", fpath_data3x0], data3x0); 3686 testTsvSample(["test-ar14", "--compatibility-mode", "-H", "-s", "--num", "1", "--inorder", fpath_data3x1], data3x1); 3687 testTsvSample(["test-ar15", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x1], data3x1); 3688 testTsvSample(["test-ar16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3689 testTsvSample(["test-ar17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum6Inorder); 3690 testTsvSample(["test-ar18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum5Inorder); 3691 testTsvSample(["test-ar19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum4Inorder); 3692 testTsvSample(["test-ar20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum3Inorder); 3693 testTsvSample(["test-ar21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum2Inorder); 3694 testTsvSample(["test-ar22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", fpath_data3x6], data3x6ExpectedSampleCompatNum1Inorder); 3695 3696 testTsvSample(["test-as10", "--compatibility-mode", "--static-seed", "--num", "1", "-i", fpath_dataEmpty], dataEmpty); 3697 testTsvSample(["test-as11", "--compatibility-mode", "--static-seed", "--num", "2", "-i", fpath_dataEmpty], dataEmpty); 3698 testTsvSample(["test-as14", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3699 testTsvSample(["test-as15", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x1_noheader], data3x1[1 .. $]); 3700 testTsvSample(["test-as16", "--compatibility-mode", "-s", "--num", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3701 testTsvSample(["test-as17", "--compatibility-mode", "-s", "--num", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6Inorder[1 .. $]); 3702 testTsvSample(["test-as18", "--compatibility-mode", "-s", "--num", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5Inorder[1 .. $]); 3703 testTsvSample(["test-as19", "--compatibility-mode", "-s", "--num", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4Inorder[1 .. $]); 3704 testTsvSample(["test-as20", "--compatibility-mode", "-s", "--num", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3Inorder[1 .. $]); 3705 testTsvSample(["test-as21", "--compatibility-mode", "-s", "--num", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2Inorder[1 .. $]); 3706 testTsvSample(["test-as22", "--compatibility-mode", "-s", "--num", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1Inorder[1 .. $]); 3707 3708 /* Inorder sampling tests with random number printing. --compatibility-mode not needed. */ 3709 testTsvSample(["test-at16", "--compatibility-mode", "-H", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 3710 testTsvSample(["test-at17", "--compatibility-mode", "-H", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum6ProbsInorder); 3711 testTsvSample(["test-at18", "--compatibility-mode", "-H", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum5ProbsInorder); 3712 testTsvSample(["test-at19", "--compatibility-mode", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 3713 testTsvSample(["test-at19", "-H", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum4ProbsInorder); 3714 testTsvSample(["test-at20", "--compatibility-mode", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 3715 testTsvSample(["test-at20", "-H", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum3ProbsInorder); 3716 testTsvSample(["test-at21", "--compatibility-mode", "-H", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum2ProbsInorder); 3717 testTsvSample(["test-at22", "--compatibility-mode", "-H", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6], data3x6ExpectedSampleCompatNum1ProbsInorder); 3718 3719 testTsvSample(["test-au16", "--compatibility-mode", "-s", "--num", "7", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 3720 testTsvSample(["test-au17", "--compatibility-mode", "-s", "--num", "6", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum6ProbsInorder[1 .. $]); 3721 testTsvSample(["test-au18", "--compatibility-mode", "-s", "--num", "5", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum5ProbsInorder[1 .. $]); 3722 testTsvSample(["test-au19", "--compatibility-mode", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 3723 testTsvSample(["test-au19", "-s", "--num", "4", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum4ProbsInorder[1 .. $]); 3724 testTsvSample(["test-au20", "--compatibility-mode", "-s", "--num", "3", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum3ProbsInorder[1 .. $]); 3725 testTsvSample(["test-au21", "--compatibility-mode", "-s", "--num", "2", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum2ProbsInorder[1 .. $]); 3726 testTsvSample(["test-au22", "--compatibility-mode", "-s", "--num", "1", "-i", "--print-random", fpath_data3x6_noheader], data3x6ExpectedSampleCompatNum1ProbsInorder[1 .. $]); 3727 3728 /* Inorder weighted sampling tests. */ 3729 testTsvSample(["test-ax16", "-H", "-s", "-n", "7", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3730 testTsvSample(["test-ax17", "-H", "-s", "-n", "6", "-i", fpath_data3x6], data3x6ExpectedWt3Num6Inorder); 3731 testTsvSample(["test-ax18", "-H", "-s", "-n", "5", "-i", fpath_data3x6], data3x6ExpectedWt3Num5Inorder); 3732 testTsvSample(["test-ax19", "-H", "-s", "-n", "4", "-i", fpath_data3x6], data3x6ExpectedWt3Num4Inorder); 3733 testTsvSample(["test-ax20", "-H", "-s", "-n", "3", "-i", fpath_data3x6], data3x6ExpectedWt3Num3Inorder); 3734 testTsvSample(["test-ax21", "-H", "-s", "-n", "2", "-i", fpath_data3x6], data3x6ExpectedWt3Num2Inorder); 3735 testTsvSample(["test-ax22", "-H", "-s", "-n", "1", "-i", fpath_data3x6], data3x6ExpectedWt3Num1Inorder); 3736 3737 testTsvSample(["test-ay16", "-s", "-n", "7", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3738 testTsvSample(["test-ay17", "-s", "-n", "6", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num6Inorder[1 .. $]); 3739 testTsvSample(["test-ay18", "-s", "-n", "5", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num5Inorder[1 .. $]); 3740 testTsvSample(["test-ay19", "-s", "-n", "4", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num4Inorder[1 .. $]); 3741 testTsvSample(["test-ay20", "-s", "-n", "3", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num3Inorder[1 .. $]); 3742 testTsvSample(["test-ay21", "-s", "-n", "2", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num2Inorder[1 .. $]); 3743 testTsvSample(["test-ay22", "-s", "-n", "1", "-i", fpath_data3x6_noheader], data3x6ExpectedWt3Num1Inorder[1 .. $]); 3744 3745 /* 3746 * Distinct sampling tests. 3747 */ 3748 testTsvSample(["test-j1", "--header", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25], 3749 data5x25ExpectedDistinctK2P40); 3750 3751 testTsvSample(["test-j2", "-H", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25], 3752 data5x25ExpectedDistinctK2K4P20); 3753 3754 testTsvSample(["test-j3", "-H", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25], 3755 data5x25ExpectedDistinctK2K3K4P20); 3756 3757 testTsvSample(["test-j4", "--static-seed", "--prob", "0.40", "--key-fields", "2", fpath_data5x25_noheader], 3758 data5x25ExpectedDistinctK2P40[1 .. $]); 3759 3760 testTsvSample(["test-j5", "-s", "-p", "0.20", "-k", "2,4", fpath_data5x25_noheader], 3761 data5x25ExpectedDistinctK2K4P20[1 .. $]); 3762 3763 testTsvSample(["test-j6", "-s", "-p", "0.20", "-k", "2-4", fpath_data5x25_noheader], 3764 data5x25ExpectedDistinctK2K3K4P20[1 .. $]); 3765 3766 3767 /* These distinct tests check that the whole line as '-k 0' and specifying all fields 3768 * in order have the same result. Also that field numbers don't matter, as '-k 1,2' 3769 * in data2x25 are the same keys as '-k 2,4' in data5x25. 3770 */ 3771 testTsvSample(["test-j7", "-H", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25], 3772 data2x25ExpectedDistinctK1K2P20); 3773 3774 testTsvSample(["test-j8", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data2x25], 3775 data2x25ExpectedDistinctK1K2P20); 3776 3777 testTsvSample(["test-j9", "-s", "-p", "0.20", "-k", "1,2", fpath_data2x25_noheader], 3778 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3779 3780 testTsvSample(["test-j10", "-s", "-p", "0.20", "-k", "0", fpath_data2x25_noheader], 3781 data2x25ExpectedDistinctK1K2P20[1 .. $]); 3782 3783 /* Similar to the last set, but for a 1-column file. Also with random value printing. */ 3784 testTsvSample(["test-j11", "-H", "-s", "-p", "0.20", "-k", "1", fpath_data1x25], 3785 data1x25ExpectedDistinctK1P20); 3786 3787 testTsvSample(["test-j12", "-H", "-s", "-p", "0.20", "-k", "0", fpath_data1x25], 3788 data1x25ExpectedDistinctK1P20); 3789 3790 testTsvSample(["test-j13", "-s", "-p", "0.20", "-k", "1", fpath_data1x25_noheader], 3791 data1x25ExpectedDistinctK1P20[1 .. $]); 3792 3793 testTsvSample(["test-j14", "-s", "-p", "0.20", "-k", "0", fpath_data1x25_noheader], 3794 data1x25ExpectedDistinctK1P20[1 .. $]); 3795 3796 3797 testTsvSample(["test-j15", "-H", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25], 3798 data1x25ExpectedDistinctK1P20Probs); 3799 3800 testTsvSample(["test-j16", "-H", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25], 3801 data1x25ExpectedDistinctK1P20Probs); 3802 3803 testTsvSample(["test-j17", "-s", "-p", "0.20", "-k", "1", "--print-random", fpath_data1x25_noheader], 3804 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3805 3806 testTsvSample(["test-j18", "-s", "-p", "0.20", "-k", "0", "--print-random", fpath_data1x25_noheader], 3807 data1x25ExpectedDistinctK1P20Probs[1 .. $]); 3808 3809 3810 testTsvSample(["test-j19", "-H", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25], 3811 data1x25ExpectedDistinctK1P20ProbsInorder); 3812 3813 testTsvSample(["test-j20", "-H", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25], 3814 data1x25ExpectedDistinctK1P20ProbsInorder); 3815 3816 testTsvSample(["test-j21", "-s", "-p", "0.20", "-k", "1", "--gen-random-inorder", fpath_data1x25_noheader], 3817 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3818 3819 testTsvSample(["test-j22", "-s", "-p", "0.20", "-k", "0", "--gen-random-inorder", fpath_data1x25_noheader], 3820 data1x25ExpectedDistinctK1P20ProbsInorder[1 .. $]); 3821 3822 }